# basic python libraries for data analysis and visulaization
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('darkgrid')
# Data Preprocessing libraries
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
# Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Model evaluation libraries
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
np.set_printoptions(threshold=np.inf)
# # Supress Warnings
# import warnings
# warnings.filterwarnings('ignore')
n_RFID - balanced or imbalanced, rate of popularity etcmLogistic Regression model hyperparameters using cross validation roc_auc score and identifying best penalty - l1, l2, elasticnet
6.2 Tuning Random Forest Hyperparameters using cross validation roc_auc score
6.3 Tuning Gradient Boosting Regression Trees model Hyperparameters using cross validation roc_auc score roc_auc and precision-recall curves and interpreting the results for these three algorithms
7.1 Identifying optimum thresholds for these three alogirthms using maximum F1Score
7.2 Getting the metrics such as Accuracy, precision, recall at the above optimum thresholds and interpreting the resultsL1 penalty regression model results in sparse coefficient and therefore can be uased for feature selection
8.1 Identiying features that can impact popularity of charging pools# creating station_data dataframe and checking the data
station_data = pd.read_csv('station_popularity_GIS_data.csv')
print(station_data.shape)
station_data.head()
(1271, 173)
| n_RFID | LS_1 | PC2 | PC4 | PC5 | PC6 | PC7 | PC13 | PC14 | PC17 | PC8 | PC9 | PC10 | PC11 | PC12 | PC18 | PC22 | PC24 | PC25 | PC26 | PC27 | PC28 | PC29 | PC30 | PC31 | PC32 | PC33 | PC34 | PC35 | PC36 | PC37 | PC38 | PC39 | PC40 | PC41 | PC42 | PC43 | PC44 | PC3 | N_14 | N_15 | N_3 | N_16 | N_18 | N_19 | N_20 | N_21 | N_25 | N_26 | N_27 | N_33 | N_34 | N_35 | N_36 | N_24 | N_31 | N_37 | N_38 | N_39 | N_40 | N_41 | N_42 | N_43 | N_44 | N_45 | N_46 | N_47 | N_48 | N_49 | N_50 | N_51 | N_32 | N_29 | N_1 | N_62 | N_63 | N_30 | N_64 | N_5 | N_6 | N_7 | N_8 | N_9 | N_10 | N_11 | N_52 | N_53 | N_54 | N_13 | N_55 | N_56 | N_57 | N_58 | N_59 | N_60 | N_61 | LC_1 | LC_2 | LC_4 | LC_5 | LC_6 | LC_7 | LC_8 | LC_11 | LC_13 | LC_15 | LC_16 | LC_17 | LC_21 | LC_22 | LC_24 | LC_25 | L_1 | L_2 | L_3 | L_4 | L_5 | L_6 | EC_1 | EC_2 | EC_4 | EC_7 | EC_8 | EC_9 | EC_10 | EC_11 | NERST_nr_car | NERST_nr_mw | NERST_nr_truck | road_density | TRDENS_nr_car | TRDENS_nr_mw | TRDENS_nr_truck | n.accomodation | n.culture | n.education | n.entertainment | n.family | n.fashion | n.food | n.health | n.hobby | n.household | n.money | n.public | n.sport | n.transportation | n.work | min_dist.accomodation | min_dist.culture | min_dist.education | min_dist.entertainment | min_dist.family | min_dist.fashion | min_dist.food | min_dist.health | min_dist.hobby | min_dist.household | min_dist.money | min_dist.public | min_dist.sport | min_dist.transportation | min_dist.work | n_of_nn_chst | min_dist_chst | RoadType_residential | RoadType_secondary | RoadType_tertiary | npoint | max_power | lat | lon | CP_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 3144.980695 | 39.0 | 79.446522 | 107.613249 | 78.108788 | 77.960151 | 204.450310 | 93.790000 | 8.323677 | 0.966141 | 50.610927 | 85.317687 | 132.287005 | 109.842805 | 123.666054 | 118.686712 | 19.768732 | 1116.784735 | 113.112821 | 129.314264 | 38.0 | 35.0 | 13.0 | 1.0 | 0.0 | 12.0 | 36.0 | 40.0 | 11.0 | 8.918225 | 31.362425 | 212.625349 | 66.812369 | 62.427576 | 18.505317 | 4.607750 | 2244.419976 | 0.000000 | 1.364034 | 3544.642211 | 1.000000 | 1638.311759 | 28.188466 | 33.330191 | 21.580986 | 12.196073 | 69.536543 | 19.291177 | 3.477227 | 8.879537 | 6.038170 | 6.360384 | 4.598984 | 24.162395 | 1.399388 | 11.137684 | 8.908554 | 0.785170 | 1.370537 | 1.048537 | 0.499318 | 5.057340 | 0.000000 | 20.240945 | 185.011859 | 34.274255 | 37.450631 | 82.043265 | 63.689933 | 423.742710 | 119.158507 | 75.021998 | 29.301200 | 24.986647 | 42.417246 | 8.067214 | 11.147909 | 24.003322 | 22.034919 | 53.194286 | 13.692495 | 67.919130 | 7.060870 | 19.531526 | 86.089071 | 49.232364 | 82.713986 | 47.391787 | 203.209558 | 591.838929 | 0.623638 | 75.523806 | 32.292735 | 365.299250 | 456.856335 | 0.036556 | 0.068360 | 0.192702 | 0.353845 | 0.066074 | 0.006650 | 0.204401 | 0.000000 | 0.00000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.071412 | 0.0 | 0.059826 | 0.050547 | 0.187637 | -0.206754 | 0.015376 | 7.591785 | 970.619979 | 6.197301e+05 | 2245.270362 | 12434.407020 | 2.567969e+06 | 318.417820 | 97544.101099 | 2.454206e+07 | 32 | 1 | 0 | 10068.826915 | 4.125124e+06 | 161272.522475 | 28838.564401 | 2 | 6 | 2 | 22 | 1 | 11 | 28 | 3 | 4 | 0 | 7 | 3 | 4 | 8 | 2 | 198.179381 | 88.770255 | 72.366384 | 75.871040 | 210.271897 | 141.261612 | 87.693581 | 262.074867 | 24.759870 | 1277.344559 | 68.639374 | 186.027051 | 136.550848 | 99.968996 | 290.174346 | 3 | 142.887158 | 1 | 0 | 0 | 1 | 11.092 | 53.198155 | 5.796057 | 1 |
| 1 | 0 | 2146.749023 | 39.0 | 79.446522 | 107.613249 | 78.108788 | 77.960151 | 204.450310 | 93.790000 | 8.323677 | 0.966141 | 50.610927 | 85.317687 | 132.287005 | 109.842805 | 123.666054 | 118.686712 | 19.768732 | 1116.784735 | 113.112821 | 129.314264 | 38.0 | 35.0 | 13.0 | 1.0 | 0.0 | 12.0 | 36.0 | 40.0 | 11.0 | 8.918225 | 31.362425 | 212.625349 | 66.812369 | 62.427576 | 18.505317 | 4.607750 | 2244.419976 | 0.000000 | 1.000000 | 3548.292922 | 1.000000 | 2132.752409 | 14.722594 | 27.219441 | 24.896112 | 16.196574 | 55.222824 | 31.829637 | 4.619597 | 24.470683 | 11.233204 | 12.677494 | 11.956400 | 25.718570 | 1.956951 | 7.480254 | 4.405958 | 0.042948 | 0.793503 | 0.540559 | 0.000000 | 2.736555 | 0.000000 | 15.640753 | 20.009846 | 12.945879 | 26.653051 | 58.284834 | 35.072746 | 166.141415 | 156.063035 | 34.273338 | 61.461690 | 16.565706 | 21.471799 | 9.451185 | 7.720763 | 33.659852 | 26.243428 | 37.350627 | 24.990243 | 44.891906 | 21.798455 | 9.791412 | 79.498775 | 64.563273 | 49.033981 | 57.871081 | 345.260769 | 792.651378 | 0.790207 | 38.872819 | 60.626561 | 580.726140 | 630.399180 | 0.000000 | 0.034898 | 0.629784 | 0.000000 | 0.000000 | 0.072518 | 0.000000 | 0.088602 | 0.03811 | 0.084467 | 0.0 | 0.0 | 0.0 | 0.0 | 0.051622 | 0.0 | 0.084035 | 0.036869 | 0.113198 | 0.013265 | -0.017752 | 8.389262 | 1433.920212 | 1.339843e+06 | 2562.010753 | 16393.672695 | 5.925417e+05 | 79.181798 | 72137.168312 | 2.974309e+06 | 88 | 1 | 0 | 7610.783466 | 2.504443e+06 | 60965.909314 | 9868.050475 | 0 | 0 | 0 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 676.299283 | 490.620725 | 421.136547 | 81.022834 | 901.331089 | 506.682043 | 333.910853 | 565.045175 | 565.240364 | 1512.122355 | 533.452282 | 456.411343 | 474.838594 | 441.840959 | 553.492377 | 1 | 178.755375 | 1 | 0 | 0 | 1 | 10.776 | 53.207220 | 5.789140 | 0 |
| 2 | 1 | 3083.904465 | 39.0 | 79.446522 | 107.613249 | 78.108788 | 77.960151 | 204.450310 | 93.790000 | 8.323677 | 0.966141 | 50.610927 | 85.317687 | 132.287005 | 109.842805 | 123.666054 | 118.686712 | 19.768732 | 1116.784735 | 113.112821 | 129.314264 | 38.0 | 35.0 | 13.0 | 1.0 | 0.0 | 12.0 | 36.0 | 40.0 | 11.0 | 8.918225 | 31.362425 | 212.625349 | 66.812369 | 62.427576 | 18.505317 | 4.607750 | 2244.419976 | 0.000000 | 1.072310 | 3499.014095 | 1.000182 | 1332.338437 | 30.599564 | 35.310522 | 19.100997 | 10.534623 | 72.336703 | 17.682940 | 2.546584 | 8.712747 | 5.028682 | 2.313197 | 2.226874 | 23.315388 | 1.370745 | 12.628220 | 9.169181 | 0.832371 | 1.157473 | 0.869065 | 0.806174 | 4.986439 | 0.000000 | 18.954586 | 212.029429 | 38.412631 | 40.978704 | 83.528615 | 56.194007 | 454.346325 | 124.491884 | 79.100143 | 24.621555 | 21.115167 | 51.673400 | 10.416500 | 11.235388 | 24.526174 | 22.762070 | 53.139467 | 14.864760 | 67.814754 | 7.435279 | 18.836296 | 51.558878 | 41.568803 | 51.637357 | 50.252440 | 142.192273 | 469.136081 | 0.554363 | 65.179432 | 30.080457 | 299.525834 | 350.666497 | 0.044744 | 0.067333 | 0.094466 | 0.333196 | 0.124522 | 0.033220 | 0.211865 | 0.000000 | 0.00000 | 0.000267 | 0.0 | 0.0 | 0.0 | 0.0 | 0.090385 | 0.0 | 0.066618 | 0.049091 | 0.195517 | -0.228675 | 0.034794 | 7.662257 | 978.286960 | 5.047840e+05 | 2229.489204 | 16451.780219 | 3.033503e+06 | 320.658231 | 131943.034982 | 3.117645e+07 | 126 | 0 | 0 | 9528.576243 | 4.240101e+06 | 163689.570985 | 30158.096875 | 1 | 7 | 3 | 31 | 1 | 11 | 31 | 3 | 4 | 0 | 7 | 5 | 3 | 7 | 2 | 177.927891 | 135.445340 | 244.346032 | 44.314617 | 222.497041 | 64.179074 | 34.177035 | 245.063878 | 75.458816 | 1287.173228 | 26.487560 | 105.221622 | 75.902557 | 81.913939 | 119.419777 | 2 | 13.177880 | 1 | 0 | 0 | 2 | 11.212 | 53.198649 | 5.792690 | 1 |
| 3 | 1 | 2611.341132 | 39.0 | 79.446522 | 107.613249 | 78.108788 | 77.960151 | 204.450310 | 93.790000 | 8.323677 | 0.966141 | 50.610927 | 85.317687 | 132.287005 | 109.842805 | 123.666054 | 118.686712 | 19.768732 | 1116.784735 | 113.112821 | 129.314264 | 38.0 | 35.0 | 13.0 | 1.0 | 0.0 | 12.0 | 36.0 | 40.0 | 11.0 | 8.918225 | 31.362425 | 212.625349 | 66.812369 | 62.427576 | 18.505317 | 4.607750 | 2244.419976 | 0.000000 | 1.000000 | 3677.687957 | 1.000000 | 1692.500527 | 20.644185 | 30.320092 | 23.320874 | 15.522466 | 61.560081 | 25.745422 | 4.314614 | 11.485993 | 7.034459 | 9.308847 | 6.041023 | 25.046745 | 1.609011 | 10.047831 | 5.176712 | 0.212391 | 0.583454 | 0.397680 | 0.441344 | 2.781891 | 0.000000 | 17.239019 | 160.984766 | 24.527347 | 26.102998 | 68.691344 | 59.136469 | 357.418399 | 137.007280 | 57.042030 | 39.493862 | 19.991848 | 38.449873 | 2.876190 | 15.112537 | 28.030782 | 23.841705 | 44.510158 | 18.301749 | 58.081237 | 12.472553 | 15.227671 | 43.307598 | 60.445545 | 61.114029 | 53.382335 | 252.376541 | 589.533400 | 0.609827 | 106.339598 | 43.597440 | 420.971372 | 467.109899 | 0.000000 | 0.049503 | 0.276142 | 0.257814 | 0.046540 | 0.139354 | 0.000000 | 0.000000 | 0.00000 | 0.112080 | 0.0 | 0.0 | 0.0 | 0.0 | 0.118568 | 0.0 | 0.097494 | 0.057370 | 0.178214 | -0.133152 | 0.009663 | 8.281939 | 1234.535176 | 8.768413e+05 | 2347.706830 | 7359.988858 | 1.782436e+06 | 286.016790 | 39855.689865 | 1.331024e+07 | 84 | 5 | 1 | 8198.343764 | 4.662696e+06 | 142454.919466 | 23481.798463 | 1 | 5 | 2 | 24 | 0 | 3 | 16 | 0 | 1 | 0 | 2 | 7 | 2 | 2 | 1 | 329.491497 | 55.207178 | 98.409048 | 65.640163 | 430.925211 | 98.946876 | 127.251956 | 386.189179 | 111.572517 | 1287.545519 | 26.452639 | 62.276035 | 62.744309 | 78.480907 | 105.372561 | 0 | 395.388444 | 1 | 0 | 0 | 1 | 11.180 | 53.202734 | 5.790615 | 1 |
| 4 | 0 | 1190.876169 | 42.0 | 10.798414 | 49.008186 | 55.653364 | 70.051249 | 219.013984 | 107.153492 | 12.736591 | 1.938177 | 13.567238 | 59.806600 | 149.239618 | 133.457321 | 103.554020 | 88.879253 | 20.489298 | 1051.876267 | 121.551377 | 80.849663 | 32.0 | 40.0 | 15.0 | 1.0 | 1.0 | 26.0 | 31.0 | 31.0 | 8.0 | 0.830647 | 26.580711 | 193.263921 | 63.406071 | 77.250192 | 24.365652 | 6.091413 | 10189.272603 | 0.276882 | 1.019625 | 1556.705202 | 2.421492 | 1184.994055 | 10.220098 | 23.417249 | 28.463144 | 23.770335 | 45.625873 | 37.734874 | 7.328232 | 6.898565 | 6.251394 | 12.651866 | 9.510295 | 30.638528 | 1.962430 | 9.762691 | 5.643084 | 0.220098 | 0.000000 | 0.000000 | 1.214361 | 3.428722 | 0.399879 | 19.706842 | 63.682493 | 6.085821 | 14.199438 | 19.911352 | 16.664438 | 140.650262 | 140.010374 | 50.656814 | 49.540272 | 32.386947 | 17.264214 | 16.741331 | 6.229971 | 24.638290 | 20.358360 | 46.955918 | 12.593554 | 53.678421 | 10.050169 | 13.356911 | 95.778471 | 37.366625 | 46.843581 | 54.254519 | 279.333490 | 519.618336 | 0.874940 | 59.252601 | 38.142045 | 400.662875 | 444.057721 | 0.037307 | 0.039405 | 0.485840 | 0.116722 | 0.031985 | 0.007434 | 0.275099 | 0.006207 | 0.00000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.026081 | 0.019500 | -0.127069 | 0.033582 | 0.035875 | 6.585049 | 1264.074246 | 6.305359e+05 | 2891.933294 | 6389.216979 | 6.344701e+05 | 140.345160 | 41876.903651 | 5.192583e+06 | 400 | 22 | 3 | 6795.810569 | 8.429442e+05 | 46140.560269 | 7142.474634 | 0 | 0 | 1 | 1 | 0 | 3 | 3 | 0 | 1 | 0 | 2 | 1 | 1 | 3 | 0 | 445.887557 | 10200.211880 | 168.213810 | 316.860886 | 646.370717 | 210.390743 | 67.794601 | 499.227070 | 339.812093 | 439.364346 | 306.795638 | 327.779076 | 207.219962 | 178.023886 | 385.383072 | 0 | 410.464343 | 0 | 1 | 0 | 1 | 11.124 | 51.969393 | 6.714074 | 1 |
# using describe function to understand mean, median (50%), standard deviation and max volume of each colums
station_data.describe()
| n_RFID | LS_1 | PC2 | PC4 | PC5 | PC6 | PC7 | PC13 | PC14 | PC17 | PC8 | PC9 | PC10 | PC11 | PC12 | PC18 | PC22 | PC24 | PC25 | PC26 | PC27 | PC28 | PC29 | PC30 | PC31 | PC32 | PC33 | PC34 | PC35 | PC36 | PC37 | PC38 | PC39 | PC40 | PC41 | PC42 | PC43 | PC44 | PC3 | N_14 | N_15 | N_3 | N_16 | N_18 | N_19 | N_20 | N_21 | N_25 | N_26 | N_27 | N_33 | N_34 | N_35 | N_36 | N_24 | N_31 | N_37 | N_38 | N_39 | N_40 | N_41 | N_42 | N_43 | N_44 | N_45 | N_46 | N_47 | N_48 | N_49 | N_50 | N_51 | N_32 | N_29 | N_1 | N_62 | N_63 | N_30 | N_64 | N_5 | N_6 | N_7 | N_8 | N_9 | N_10 | N_11 | N_52 | N_53 | N_54 | N_13 | N_55 | N_56 | N_57 | N_58 | N_59 | N_60 | N_61 | LC_1 | LC_2 | LC_4 | LC_5 | LC_6 | LC_7 | LC_8 | LC_11 | LC_13 | LC_15 | LC_16 | LC_17 | LC_21 | LC_22 | LC_24 | LC_25 | L_1 | L_2 | L_3 | L_4 | L_5 | L_6 | EC_1 | EC_2 | EC_4 | EC_7 | EC_8 | EC_9 | EC_10 | EC_11 | NERST_nr_car | NERST_nr_mw | NERST_nr_truck | road_density | TRDENS_nr_car | TRDENS_nr_mw | TRDENS_nr_truck | n.accomodation | n.culture | n.education | n.entertainment | n.family | n.fashion | n.food | n.health | n.hobby | n.household | n.money | n.public | n.sport | n.transportation | n.work | min_dist.accomodation | min_dist.culture | min_dist.education | min_dist.entertainment | min_dist.family | min_dist.fashion | min_dist.food | min_dist.health | min_dist.hobby | min_dist.household | min_dist.money | min_dist.public | min_dist.sport | min_dist.transportation | min_dist.work | n_of_nn_chst | min_dist_chst | RoadType_residential | RoadType_secondary | RoadType_tertiary | npoint | max_power | lat | lon | CP_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1.271000e+03 | 1271.000000 | 1271.000000 | 1.271000e+03 | 1271.000000 | 1.271000e+03 | 1.271000e+03 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1.271000e+03 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.00000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 | 1271.000000 |
| mean | 0.254131 | 1396.054167 | 40.085391 | 28.813428 | 74.713836 | 64.091585 | 70.769719 | 238.687800 | 119.741316 | 11.881574 | 1.514142 | 18.628799 | 79.259727 | 144.152048 | 119.242654 | 111.102161 | 109.967415 | 18.471155 | 1066.199778 | 135.072704 | 178.545693 | 33.512568 | 39.364808 | 14.808826 | 1.092354 | 0.655204 | 15.575702 | 35.494132 | 34.553355 | 12.228009 | 14.860482 | 7.731859 | 200.855976 | 75.396117 | 80.590070 | 25.248988 | 7.214198 | 21750.671100 | 0.668912 | 1.352465 | 1750.059447 | 2.723106 | 1520.002065 | 12.050120 | 25.128802 | 27.477270 | 20.006029 | 47.365824 | 38.900955 | 5.963936 | 14.211549 | 8.557164 | 12.914627 | 9.901123 | 29.528996 | 2.103431 | 9.997545 | 9.776584 | 1.489305 | 0.736649 | 1.534431 | 1.584522 | 4.233981 | 0.748021 | 16.245520 | 40.798893 | 12.285041 | 14.812291 | 42.398832 | 21.272813 | 149.112729 | 213.714005 | 38.379379 | 55.758395 | 27.964949 | 14.867027 | 15.052130 | 5.740593 | 31.069361 | 24.682719 | 39.067513 | 21.209757 | 40.119593 | 20.566341 | 9.288570 | 68.034411 | 37.868755 | 41.638722 | 58.189770 | 279.924375 | 640.957455 | 3.570880 | 60.646673 | 50.082406 | 437.302876 | 529.577640 | 0.012379 | 0.053805 | 0.514013 | 0.080404 | 0.011632 | 0.032601 | 0.080793 | 0.005894 | 0.019122 | 0.041759 | 0.026555 | 0.002056 | 0.057153 | 0.012070 | 0.034529 | 0.003576 | 0.009266 | 0.014071 | 0.007924 | 0.008055 | 0.010405 | 6.997852 | 1247.564872 | 6.794550e+05 | 2997.989922 | 12108.920631 | 5.029768e+05 | 71.947850 | 5.950048e+04 | 4.704024e+06 | 185.450039 | 7.165224 | 2.197482 | 7050.037619 | 1.581150e+06 | 65040.437387 | 35971.674971 | 0.332022 | 0.889064 | 0.763179 | 4.191975 | 0.185681 | 4.021243 | 5.920535 | 1.391817 | 0.808025 | 1.051928 | 1.011802 | 0.575924 | 0.84343 | 2.215578 | 1.133753 | 1583.809084 | 959.656672 | 917.570211 | 312.699289 | 3183.352209 | 1219.617840 | 283.440415 | 743.152057 | 693.489630 | 1505.028654 | 745.839413 | 826.345745 | 713.695649 | 339.837575 | 630.083094 | 0.878836 | 633.802881 | 0.732494 | 0.102282 | 0.143194 | 1.695515 | 10.276625 | 52.066366 | 5.226565 | 0.589300 |
| std | 0.435543 | 973.783270 | 2.591093 | 36.327040 | 55.176233 | 36.524251 | 30.177542 | 94.116421 | 46.855747 | 4.824986 | 1.050476 | 12.463524 | 42.475639 | 46.232757 | 38.843690 | 49.415118 | 61.005076 | 11.978802 | 341.090590 | 83.629001 | 185.735846 | 5.145582 | 3.795535 | 2.510264 | 0.449919 | 1.251375 | 5.533738 | 4.021137 | 5.058866 | 3.193721 | 99.011050 | 25.023571 | 69.404786 | 29.895966 | 30.248218 | 10.158108 | 4.504075 | 39070.769927 | 6.082647 | 0.788273 | 1216.271660 | 1.226830 | 1039.407839 | 5.121082 | 6.743132 | 4.840447 | 7.760780 | 9.675007 | 8.630692 | 3.033286 | 13.229492 | 4.667424 | 10.253800 | 8.226814 | 5.707924 | 0.332998 | 4.816562 | 9.719693 | 2.654876 | 1.261010 | 2.611890 | 2.797986 | 3.299756 | 2.011574 | 11.594567 | 45.743107 | 12.726678 | 13.361111 | 45.484910 | 24.525246 | 135.146534 | 67.573247 | 24.786279 | 15.895035 | 13.801981 | 10.725023 | 18.357928 | 5.196792 | 6.612831 | 4.643533 | 6.795057 | 7.835680 | 13.125617 | 9.810430 | 4.675836 | 53.263063 | 28.423250 | 66.714351 | 6.363948 | 173.685492 | 410.832499 | 26.088895 | 85.732721 | 29.013068 | 241.623536 | 297.575982 | 0.036494 | 0.045439 | 0.240002 | 0.112186 | 0.032662 | 0.057717 | 0.151976 | 0.018367 | 0.056941 | 0.058378 | 0.069889 | 0.012012 | 0.111558 | 0.050837 | 0.064845 | 0.017126 | 0.059487 | 0.049135 | 0.115552 | 0.102443 | 0.058682 | 1.159280 | 406.788159 | 4.308274e+05 | 676.603755 | 23117.435018 | 5.632521e+05 | 81.178810 | 1.074804e+05 | 6.139980e+06 | 335.544562 | 15.543926 | 7.155496 | 2011.431881 | 1.587678e+06 | 84692.812056 | 90056.231455 | 0.967132 | 1.950037 | 1.343903 | 7.817261 | 0.528092 | 13.763996 | 9.944768 | 2.336778 | 1.443737 | 3.230096 | 1.575107 | 0.978680 | 1.31072 | 2.873848 | 1.456255 | 1367.868107 | 1271.548831 | 1482.184662 | 368.156049 | 3907.867818 | 1827.574842 | 365.158896 | 1163.230097 | 814.897711 | 2073.305392 | 1072.830353 | 895.070665 | 1001.574077 | 390.310195 | 1084.519655 | 1.544954 | 1023.014358 | 0.442833 | 0.303138 | 0.350409 | 0.630688 | 2.058611 | 0.477536 | 0.721957 | 0.492155 |
| min | 0.000000 | 26.618981 | 27.000000 | 0.000000 | 0.000000 | 0.154593 | 0.357551 | 0.857741 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.668240 | 0.593437 | 0.304199 | 0.000000 | 0.000000 | 4.493166 | 0.357551 | 0.000000 | 7.000000 | 30.000000 | 5.000000 | 0.000000 | 0.000000 | 2.403294 | 15.000000 | 9.613177 | 3.000000 | -186.960128 | -178.025586 | 0.802885 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 32.134190 | 0.000000 | 1.000000 | 48.790848 | 1.000000 | 0.177309 | 0.000000 | 6.000000 | 1.000000 | 0.000000 | 27.863188 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.235838 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.990559 | 92.057844 | 0.000000 | 4.000000 | 0.000000 | 0.000000 | 0.000000 | 0.939332 | 11.600000 | 11.100000 | 12.000000 | 3.000000 | 7.000000 | 1.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 20.000000 | 0.000000 | 6.177267 | 0.003402 | 1.459540 | 0.000000 | 2.389303 | 3.842820 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -0.139068 | -0.343620 | -0.701644 | -0.353011 | -0.292269 | 2.470007 | 0.000000 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000e+00 | 0.000000 | 4.558638e+03 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1347.238489 | 2.989008e+03 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 13.513766 | 8.183483 | 14.410388 | 8.286994 | 28.035039 | 7.726437 | 2.217077 | 8.825757 | 9.737658 | 9.811746 | 6.264985 | 2.988848 | 15.952530 | 1.591057 | 4.219574 | 0.000000 | 1.733718 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.617000 | 50.770992 | 3.382141 | 0.000000 |
| 25% | 0.000000 | 678.049784 | 39.000000 | 5.924619 | 34.334630 | 38.977558 | 50.433254 | 179.104530 | 89.850914 | 8.437765 | 0.855925 | 9.895561 | 49.164804 | 120.096902 | 97.803029 | 77.887973 | 70.945548 | 10.745000 | 877.818738 | 74.050265 | 44.337413 | 30.000000 | 37.000000 | 13.000000 | 1.000000 | 0.000000 | 11.000000 | 33.000000 | 31.000000 | 10.000000 | -32.638228 | -4.044457 | 160.780805 | 55.863607 | 62.427576 | 18.534128 | 4.474987 | 2113.991762 | 0.000000 | 1.000000 | 886.939302 | 1.965333 | 831.589282 | 9.857091 | 20.759821 | 24.792557 | 14.389066 | 41.001353 | 34.418809 | 3.976415 | 5.362173 | 6.328666 | 5.359993 | 5.050840 | 25.722069 | 1.881118 | 6.980487 | 3.457180 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 9.060545 | 14.154475 | 4.498004 | 6.111089 | 15.460217 | 7.994040 | 66.196857 | 167.784072 | 17.522799 | 45.861709 | 19.000000 | 7.269560 | 5.000000 | 3.000000 | 26.703306 | 21.616788 | 35.000000 | 15.175172 | 30.470886 | 13.059934 | 6.000000 | 30.260519 | 17.911586 | 8.913951 | 54.811470 | 155.377307 | 399.756195 | 0.823978 | 34.113675 | 30.567217 | 269.799932 | 327.284581 | 0.000000 | 0.029291 | 0.351585 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -0.029143 | -0.001639 | -0.051881 | -0.050089 | -0.017178 | 6.086574 | 1060.040244 | 3.896538e+05 | 2605.571926 | 4674.491382 | 1.518724e+05 | 22.417596 | 2.341378e+04 | 1.171730e+06 | 12.000000 | 0.000000 | 0.000000 | 5776.302501 | 6.403544e+05 | 17780.403545 | 3374.186240 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 460.434767 | 226.005065 | 220.366050 | 98.633334 | 574.971320 | 193.307034 | 75.054179 | 173.556039 | 212.419524 | 281.995861 | 185.764362 | 244.487807 | 201.559822 | 106.063836 | 148.835629 | 0.000000 | 191.767907 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 10.747500 | 51.806228 | 4.648421 | 0.000000 |
| 50% | 0.000000 | 1221.967672 | 40.000000 | 14.148009 | 57.400238 | 57.341121 | 69.328149 | 229.577164 | 116.436821 | 11.630724 | 1.341993 | 15.586808 | 72.408863 | 143.736783 | 119.100200 | 108.154607 | 101.434870 | 17.777041 | 1107.488308 | 124.653900 | 124.312747 | 34.000000 | 39.000000 | 15.000000 | 1.000000 | 0.000000 | 15.000000 | 36.000000 | 34.000000 | 12.000000 | 4.861560 | 1.486710 | 203.453632 | 74.136383 | 80.179393 | 24.918964 | 6.090449 | 5780.638189 | 0.000000 | 1.000000 | 1507.788836 | 2.616494 | 1332.082678 | 11.000000 | 23.480174 | 27.904754 | 19.749973 | 45.000000 | 41.181770 | 5.665706 | 10.607063 | 8.000000 | 10.737457 | 8.124825 | 29.910939 | 2.121266 | 9.109062 | 6.924300 | 0.693625 | 0.416013 | 0.926343 | 0.766838 | 3.120966 | 0.004432 | 13.724326 | 24.152866 | 8.297609 | 10.649090 | 27.763546 | 13.726954 | 109.010812 | 204.968528 | 32.958822 | 58.000000 | 26.637303 | 11.691872 | 10.044577 | 4.629371 | 29.805849 | 23.932740 | 38.958922 | 20.314535 | 38.884662 | 19.000000 | 8.000000 | 55.603631 | 31.933414 | 20.825806 | 58.633529 | 260.372491 | 607.579550 | 1.014327 | 49.009188 | 46.243644 | 422.442589 | 505.030443 | 0.000000 | 0.043202 | 0.555960 | 0.033348 | 0.000000 | 0.000000 | 0.007512 | 0.000000 | 0.000000 | 0.013314 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.003015 | 0.000000 | 0.001233 | 0.027688 | 0.004460 | 0.029138 | 0.013022 | 7.000000 | 1262.537548 | 6.302017e+05 | 3034.255950 | 6657.135535 | 3.240430e+05 | 41.109474 | 3.459669e+04 | 2.583941e+06 | 48.000000 | 1.000000 | 0.000000 | 7075.723227 | 1.130324e+06 | 40636.837216 | 11339.451456 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 1.000000 | 1.000000 | 1223.062088 | 526.665732 | 427.679683 | 198.587474 | 1567.792122 | 485.410313 | 171.259815 | 361.173607 | 442.473305 | 689.351068 | 404.032566 | 536.600030 | 390.905125 | 237.107384 | 292.455750 | 0.000000 | 336.086532 | 1.000000 | 0.000000 | 0.000000 | 2.000000 | 11.031000 | 52.066079 | 5.190485 | 1.000000 |
| 75% | 1.000000 | 1904.806749 | 41.000000 | 39.427395 | 101.815099 | 79.109554 | 83.607335 | 315.247225 | 150.489715 | 15.053064 | 1.938177 | 23.912723 | 102.816977 | 173.475691 | 142.013673 | 145.690105 | 132.097743 | 23.067884 | 1284.654228 | 175.656497 | 210.445702 | 37.000000 | 42.000000 | 16.000000 | 1.000000 | 1.000000 | 19.000000 | 38.000000 | 39.000000 | 14.000000 | 33.748310 | 13.630793 | 240.605540 | 93.875971 | 99.847753 | 30.932003 | 9.558949 | 24042.088337 | 0.000000 | 1.280882 | 2227.045193 | 3.888474 | 2005.471886 | 12.753615 | 28.455944 | 30.257354 | 24.799817 | 50.607516 | 45.093851 | 7.564412 | 18.418407 | 10.264421 | 17.710618 | 12.515441 | 33.778782 | 2.313367 | 11.997426 | 12.224600 | 1.955648 | 1.000000 | 1.494500 | 1.958977 | 5.051077 | 0.875762 | 20.327849 | 50.255153 | 14.807339 | 18.808442 | 48.794514 | 23.176048 | 175.608798 | 243.491404 | 56.763542 | 67.371264 | 35.000000 | 19.421913 | 17.055007 | 7.043127 | 33.800724 | 26.607331 | 42.934071 | 25.611580 | 48.905799 | 27.000000 | 11.623951 | 94.242695 | 52.005361 | 53.239910 | 62.003613 | 380.287620 | 825.401058 | 1.189517 | 70.165745 | 65.701032 | 577.472022 | 703.767112 | 0.000000 | 0.064737 | 0.700163 | 0.124281 | 0.000000 | 0.047112 | 0.092714 | 0.000838 | 0.002061 | 0.066366 | 0.016425 | 0.000000 | 0.060487 | 0.000000 | 0.043594 | 0.000000 | 0.046291 | 0.044128 | 0.065133 | 0.084314 | 0.039807 | 7.992497 | 1465.070921 | 8.988739e+05 | 3390.799871 | 10983.812112 | 6.318876e+05 | 90.540143 | 5.726475e+04 | 5.834274e+06 | 182.000000 | 8.000000 | 1.000000 | 8274.204816 | 1.986072e+06 | 77454.000025 | 27621.902970 | 0.000000 | 1.000000 | 1.000000 | 5.000000 | 0.000000 | 2.000000 | 8.000000 | 2.000000 | 1.000000 | 1.000000 | 2.000000 | 1.000000 | 1.00000 | 3.000000 | 2.000000 | 2336.595922 | 1111.156341 | 848.617115 | 385.415231 | 4492.187072 | 1287.586192 | 366.400371 | 781.776161 | 873.155941 | 1755.650031 | 846.951591 | 1061.579377 | 746.845341 | 434.682134 | 594.125264 | 1.000000 | 599.747472 | 1.000000 | 0.000000 | 0.000000 | 2.000000 | 11.216000 | 52.305982 | 5.773413 | 1.000000 |
| max | 1.000000 | 7615.979870 | 51.000000 | 219.088799 | 241.789566 | 166.326603 | 164.851232 | 765.007677 | 341.675485 | 29.951383 | 8.181900 | 76.283892 | 236.055840 | 299.676717 | 308.729635 | 396.822164 | 256.007375 | 132.001720 | 2417.172315 | 879.228407 | 678.847461 | 54.000000 | 52.000000 | 42.320936 | 5.000000 | 14.000000 | 39.000000 | 50.000000 | 51.000000 | 41.000000 | 1287.577307 | 126.226168 | 375.544258 | 165.385474 | 235.053348 | 72.043016 | 51.212349 | 413401.896201 | 155.477696 | 5.997844 | 7779.164539 | 5.000000 | 7097.636310 | 70.000000 | 67.000000 | 53.389596 | 63.064368 | 100.000000 | 56.229404 | 33.012579 | 88.033298 | 83.782230 | 81.327781 | 119.629756 | 52.228628 | 3.219340 | 52.000000 | 82.579195 | 34.434845 | 18.000000 | 21.960289 | 30.492943 | 24.034340 | 41.865950 | 113.802592 | 409.163549 | 115.507780 | 109.606157 | 319.663937 | 193.907783 | 1041.862854 | 647.425696 | 100.000000 | 97.292829 | 83.139024 | 86.000000 | 100.000000 | 86.982342 | 78.900957 | 50.996088 | 86.000000 | 53.589353 | 94.000000 | 65.232373 | 36.257458 | 422.641359 | 203.681664 | 855.960692 | 87.000000 | 1954.832441 | 8379.589329 | 507.359826 | 2175.151288 | 186.579791 | 1469.810378 | 3311.915023 | 0.371367 | 0.450322 | 0.971081 | 0.814097 | 0.399478 | 0.459302 | 1.000000 | 0.372095 | 0.796863 | 0.355383 | 0.779464 | 0.231654 | 0.855582 | 0.682348 | 0.560780 | 0.244233 | 0.209611 | 0.112041 | 0.571980 | 0.196548 | 0.257000 | 9.000000 | 2777.022398 | 3.013102e+06 | 5810.076591 | 352722.719077 | 5.339439e+06 | 510.289453 | 1.914631e+06 | 6.662351e+07 | 3603.000000 | 185.000000 | 130.000000 | 13551.341713 | 1.229032e+07 | 703726.302175 | 948933.956062 | 10.000000 | 24.000000 | 19.000000 | 116.000000 | 4.000000 | 266.000000 | 161.000000 | 29.000000 | 15.000000 | 57.000000 | 10.000000 | 7.000000 | 9.00000 | 23.000000 | 14.000000 | 7801.642796 | 11423.176225 | 12198.593631 | 3936.541289 | 27146.382779 | 15274.475354 | 4502.806909 | 12558.946006 | 7043.125512 | 20136.032144 | 9035.220338 | 7575.759809 | 7915.549624 | 4770.061453 | 10872.320313 | 16.000000 | 9911.466543 | 1.000000 | 1.000000 | 1.000000 | 12.000000 | 12.000000 | 53.443334 | 7.205534 | 1.000000 |
station_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1271 entries, 0 to 1270 Columns: 173 entries, n_RFID to CP_type dtypes: float64(148), int64(25) memory usage: 1.7 MB
n_RFID- Popularity of a charging pool (1/0)¶The response variable is n_RFID (popularity of charging pool)
station_data['n_RFID'].value_counts()
0 948 1 323 Name: n_RFID, dtype: int64
so there are 948 unpopular stations and 323 popular stations
n_popular = station_data['n_RFID'].value_counts()[1]
n_unpopular = station_data['n_RFID'].value_counts()[0]
len_df = station_data.shape[0]
# printing number of popular and unpopular and percentage of popular chargin pools
print('Total number of popular charing pools are {}'.format(n_popular))
print('Total number of unpopular charging pools are {}'.format(n_unpopular))
print('Percentage of popular charging pools are {}%'.format(round(n_popular/len_df*100, 2)))
# plotting
plt.figure(figsize = (8,6))
sns.countplot(station_data['n_RFID'])
plt.title('Count Plot of charging pools - Popular(1) vs unpopular(0)')
plt.show()
Total number of popular charing pools are 323 Total number of unpopular charging pools are 948 Percentage of popular charging pools are 25.41%
# checking the columns for objects, strings datta types
len(station_data.select_dtypes(include='number').columns)
173
All the columns are numerical data type and doesnt require any one hot encoding or dummy creation
# Checking for null values
station_data.isnull().sum().sum()
0
There are no null values in the provided data
# Checking the cols with low unique values
cat_cols = station_data.nunique()[station_data.nunique()<10].index.to_list()
for col in cat_cols:
print('understanding the value counts for category column {}'.format(col))
print(station_data[col].value_counts())
# removing response variable from cat_cols
cat_cols.remove('n_RFID')
understanding the value counts for category column n_RFID 0 948 1 323 Name: n_RFID, dtype: int64 understanding the value counts for category column PC31 1.000000 1125 2.000000 87 0.000000 31 3.000000 21 5.000000 2 4.000000 2 4.323037 1 3.204392 1 0.854006 1 Name: PC31, dtype: int64 understanding the value counts for category column n.family 0 1094 1 137 2 24 3 13 4 3 Name: n.family, dtype: int64 understanding the value counts for category column n.public 0 815 1 286 2 108 3 37 4 12 5 9 7 2 6 2 Name: n.public, dtype: int64 understanding the value counts for category column RoadType_residential 1 931 0 340 Name: RoadType_residential, dtype: int64 understanding the value counts for category column RoadType_secondary 0 1141 1 130 Name: RoadType_secondary, dtype: int64 understanding the value counts for category column RoadType_tertiary 0 1089 1 182 Name: RoadType_tertiary, dtype: int64 understanding the value counts for category column npoint 2 834 1 424 4 8 3 3 12 1 10 1 Name: npoint, dtype: int64 understanding the value counts for category column CP_type 1 749 0 522 Name: CP_type, dtype: int64
# defining functions for num_plot which would be used for plotting univariate analysis and bi-variate analysis graphs
# to understand the data
def num_plot(col):
fig, (ax1, ax2) = plt.subplots(1,2, figsize = (14,6))
sns.boxplot(x = 'n_RFID', y = col, data = station_data, ax = ax1)
sns.distplot(station_data[col], ax = ax2)
ax1.set_title('{} vs popularity of charging pools'.format(col))
ax2.set_title('Distribution plot of {}'.format(col))
plt.show()
# plotting univariate and bi-variate analysis graph based on the above function
for col in station_data.columns:
if col not in ['n_RFID'] and col not in cat_cols:
try:
num_plot(col)
except RuntimeError as re:
if str(re) == 'Selected KDE bandwidth is 0. Cannot estiamte density.':
sns.distplot(station_data, kde_kws={'bw': 0.1})
else:
raise re
# for cat_cols which is having low unique values we can use different function for plotting univariate and bivariate analysis graphs
def cat_plot(col):
fig, (ax1, ax2) = plt.subplots(1,2, figsize = (14,6))
rev_data = station_data[['n_RFID', col]].groupby(col).mean()['n_RFID']
sns.barplot(x= rev_data.index, y= rev_data.values, ax = ax1)
sns.countplot(station_data[col], ax = ax2)
ax1.set_title('{} vs popularity of charging pools'.format(col))
ax2.set_title('Distribution plot of {}'.format(col))
plt.close(0)
plt.show()
# plotting univariate and bi-variate analysis graph based on the above function
for col in cat_cols:
cat_plot(col)
# identifying columns that are highly correlated with the response variable
# using corr function to estimate and corrleation between columns and selecting correlations between `n_RFID` and other columns
# selecting 0.3 and -0.3 as thresholds on either side of zeros as minimum correlation required
station_corr = station_data.corr()['n_RFID'][(station_data.corr()['n_RFID'] > 0.3) | (station_data.corr()['n_RFID'] < -0.3)].sort_values(ascending = False)
# ignoring first row since its corellation with 'n_RFID' itself
station_corr = station_corr[1:]
plt.figure(figsize = (12,10))
sns.barplot( x = station_corr.index, y = station_corr.values)
plt.title('correlation of major columns with popularity')
plt.show()
# Train and test split of the data
evci_train, evci_test = train_test_split(station_data, test_size = 0.2, random_state = 0, stratify = station_data['n_RFID'])
# scaling the data
scaled_cols = evci_train.columns.to_list()
scaled_cols.remove('n_RFID')
scaler = StandardScaler()
# fitting and transforming train data
evci_train[scaled_cols] = scaler.fit_transform(evci_train[scaled_cols])
# transforming the test data
evci_test[scaled_cols] = scaler.transform(evci_test[scaled_cols])
C:\Users\raviprasad\anaconda3\lib\site-packages\ipykernel_launcher.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy C:\Users\raviprasad\anaconda3\lib\site-packages\pandas\core\indexing.py:965: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy self.obj[item] = s C:\Users\raviprasad\anaconda3\lib\site-packages\ipykernel_launcher.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy # This is added back by InteractiveShellApp.init_path() C:\Users\raviprasad\anaconda3\lib\site-packages\pandas\core\indexing.py:965: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy self.obj[item] = s
print(evci_train.shape)
evci_train.head()
(1016, 173)
| n_RFID | LS_1 | PC2 | PC4 | PC5 | PC6 | PC7 | PC13 | PC14 | PC17 | PC8 | PC9 | PC10 | PC11 | PC12 | PC18 | PC22 | PC24 | PC25 | PC26 | PC27 | PC28 | PC29 | PC30 | PC31 | PC32 | PC33 | PC34 | PC35 | PC36 | PC37 | PC38 | PC39 | PC40 | PC41 | PC42 | PC43 | PC44 | PC3 | N_14 | N_15 | N_3 | N_16 | N_18 | N_19 | N_20 | N_21 | N_25 | N_26 | N_27 | N_33 | N_34 | N_35 | N_36 | N_24 | N_31 | N_37 | N_38 | N_39 | N_40 | N_41 | N_42 | N_43 | N_44 | N_45 | N_46 | N_47 | N_48 | N_49 | N_50 | N_51 | N_32 | N_29 | N_1 | N_62 | N_63 | N_30 | N_64 | N_5 | N_6 | N_7 | N_8 | N_9 | N_10 | N_11 | N_52 | N_53 | N_54 | N_13 | N_55 | N_56 | N_57 | N_58 | N_59 | N_60 | N_61 | LC_1 | LC_2 | LC_4 | LC_5 | LC_6 | LC_7 | LC_8 | LC_11 | LC_13 | LC_15 | LC_16 | LC_17 | LC_21 | LC_22 | LC_24 | LC_25 | L_1 | L_2 | L_3 | L_4 | L_5 | L_6 | EC_1 | EC_2 | EC_4 | EC_7 | EC_8 | EC_9 | EC_10 | EC_11 | NERST_nr_car | NERST_nr_mw | NERST_nr_truck | road_density | TRDENS_nr_car | TRDENS_nr_mw | TRDENS_nr_truck | n.accomodation | n.culture | n.education | n.entertainment | n.family | n.fashion | n.food | n.health | n.hobby | n.household | n.money | n.public | n.sport | n.transportation | n.work | min_dist.accomodation | min_dist.culture | min_dist.education | min_dist.entertainment | min_dist.family | min_dist.fashion | min_dist.food | min_dist.health | min_dist.hobby | min_dist.household | min_dist.money | min_dist.public | min_dist.sport | min_dist.transportation | min_dist.work | n_of_nn_chst | min_dist_chst | RoadType_residential | RoadType_secondary | RoadType_tertiary | npoint | max_power | lat | lon | CP_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1262 | 0 | -0.376875 | -0.434262 | -0.733065 | -1.027260 | -1.215038 | -0.978980 | -0.453521 | -0.459945 | -1.022952 | -0.348182 | -0.955437 | -1.067853 | -0.761427 | -0.761935 | -1.314335 | -1.067746 | -0.729443 | -0.589163 | -1.091799 | -0.838296 | -0.494664 | -0.083954 | 0.477949 | -0.217521 | 1.077365 | 0.471656 | -1.164613 | 0.683494 | -0.697807 | 0.299874 | -0.368939 | -1.061176 | -0.877638 | -0.611911 | -0.101369 | 0.782613 | 0.162315 | -0.106579 | -0.441860 | -0.818817 | 1.063137 | -0.659484 | -0.011835 | -0.469936 | 0.107061 | -0.248029 | -0.666275 | 1.175575 | -0.310689 | -0.371456 | 0.495093 | -0.530666 | -0.091078 | 0.796591 | 1.193151 | -1.244306 | -0.790912 | -0.552017 | -0.590769 | -0.581621 | -0.564977 | -0.677306 | 1.039392 | -0.436775 | -0.572880 | -0.573530 | -0.523166 | -0.659569 | -0.589988 | -0.659232 | 0.033875 | -1.123871 | 1.153789 | -0.945807 | -0.558837 | 0.168965 | -0.521526 | -0.089355 | -0.390362 | -0.008005 | 0.230086 | -1.005183 | 0.558439 | -1.119511 | -0.799133 | -0.851959 | -0.540803 | 1.072600 | -0.798892 | -0.527447 | -0.097598 | -0.090609 | -0.313090 | -0.594529 | -0.622255 | -0.343064 | -0.571922 | 0.916368 | -0.329035 | -0.353318 | -0.580070 | -0.121893 | -0.304053 | -0.338388 | -0.240360 | -0.379747 | -0.172305 | 0.433626 | -0.228307 | -0.525841 | -0.212142 | -0.116096 | 0.827089 | -1.155199 | 1.378326 | 0.480242 | 0.862566 | 0.078199 | -0.661574 | 0.058103 | -0.244783 | -0.881627 | -0.875898 | 0.205653 | -0.189136 | -0.310730 | -0.384869 | -0.299824 | -0.542030 | -0.678605 | -0.592346 | -0.391972 | -0.347119 | -0.464583 | -0.540650 | -0.287928 | -0.351213 | -0.288095 | -0.198761 | -0.572551 | -0.541194 | -0.320211 | 0.00000 | -0.588072 | 0.089748 | -0.432462 | -0.787589 | 0.755117 | -0.199932 | -0.005085 | -0.096130 | 0.459335 | 1.828874 | -0.617712 | -0.169228 | -0.012589 | 1.386519 | -0.584940 | 3.188998 | -0.520886 | -0.015039 | 2.025458 | -0.558663 | -0.015291 | 0.57735 | -0.324884 | -0.396454 | 0.507401 | -3.309752 | -1.230843 | -1.780691 | 0.853165 |
| 13 | 0 | 0.346231 | -0.827311 | 1.147768 | 2.490096 | 2.728308 | 1.340875 | 0.914969 | 0.608591 | 1.434642 | 1.449416 | 1.701539 | 2.139729 | 0.193386 | -0.064525 | 1.159011 | 2.223665 | 0.284614 | 0.182361 | 1.962437 | 2.421862 | 1.641925 | -1.134829 | -1.134013 | -0.217521 | -0.512201 | -1.572024 | 1.122079 | -0.312337 | 1.770543 | -0.633591 | 1.360551 | 1.195655 | 1.185450 | 0.297275 | 0.311186 | 1.072135 | -0.538755 | -0.099055 | -0.441860 | -0.274537 | 0.296562 | -0.429647 | -0.213905 | -0.908903 | 0.729981 | 0.137992 | -0.151302 | 0.251552 | 0.016002 | -0.677386 | -0.412036 | -0.222759 | 0.022486 | -0.434958 | 0.293295 | 1.236690 | 0.022765 | -0.186681 | 0.217227 | 0.189263 | -0.564977 | 0.838697 | -0.349333 | -0.206103 | -0.319152 | -0.146064 | 0.844171 | -0.018543 | -0.092288 | -0.090699 | 2.498314 | -0.470199 | -0.602621 | 1.019162 | -0.276257 | -0.665324 | 0.041909 | 2.182018 | 2.079884 | -0.458539 | 1.258868 | -0.316826 | 1.174269 | -0.064213 | -0.621718 | -0.843968 | -0.407988 | -0.041333 | -0.325459 | -0.432752 | -0.111328 | -0.209493 | -0.593013 | -0.562200 | -0.437725 | -0.343064 | -0.676098 | 0.464252 | -0.282183 | -0.353318 | -0.580070 | -0.389625 | -0.304053 | -0.338388 | 0.421271 | 0.952816 | -0.172305 | -0.504475 | 1.044525 | 0.287735 | -0.211844 | 0.424450 | 0.106430 | 1.091191 | 0.318018 | 0.342494 | 1.732163 | 0.870142 | -0.124882 | 0.291830 | -0.197132 | -0.310242 | -0.337743 | -0.087284 | -0.167133 | -0.301724 | -0.384869 | -0.299824 | -0.712913 | -0.592993 | -0.549030 | -0.296038 | 0.643535 | -0.464583 | -0.540650 | -0.524991 | -0.351213 | -0.288095 | -0.104747 | -0.572551 | -0.541194 | -0.320211 | -0.63724 | -0.588072 | -0.645610 | -0.432462 | -0.110609 | -1.055597 | -0.404178 | -0.095513 | 0.376731 | -0.722153 | -0.141373 | -0.388376 | 0.224409 | -0.018129 | -0.239387 | 0.524961 | 0.391409 | 0.241521 | -0.021650 | -0.477313 | 0.068521 | -0.492116 | 0.57735 | -0.324884 | -0.396454 | 0.507401 | 0.333904 | 0.506513 | -0.531288 | -1.172106 |
| 451 | 1 | 2.493564 | -0.434262 | 0.708571 | 1.472991 | 1.642536 | 1.516160 | 0.915795 | 1.028120 | 1.386009 | 2.025219 | 1.786051 | 1.201272 | 0.731935 | 0.549572 | 0.752266 | 2.309080 | 0.378128 | 0.179609 | 0.915217 | 2.550026 | 1.059219 | -1.134829 | -0.731022 | -0.217521 | -0.512201 | -0.643078 | 0.868003 | -0.113171 | 0.227825 | -0.898734 | 0.285017 | 1.277983 | 1.314188 | 0.313945 | 0.462885 | 1.320011 | -0.540741 | -0.106579 | 2.842768 | 4.066697 | -1.403041 | 4.459411 | 1.276537 | 1.305388 | -1.011122 | -1.535925 | 1.856617 | -1.726546 | -1.292263 | 4.548387 | 0.753823 | 1.377398 | -0.743927 | -1.843575 | -0.550742 | 0.715107 | 4.377667 | 3.844228 | 1.844132 | 3.130121 | 3.728318 | 3.567515 | 0.593542 | 2.834711 | 1.392573 | 3.114151 | 0.953030 | 3.756520 | 4.510421 | 3.177587 | -1.199392 | 2.343358 | -1.663843 | 1.060510 | 1.107588 | -0.439271 | 0.301718 | -1.085644 | -1.341852 | 2.165048 | -0.937869 | 1.718677 | -1.001765 | 3.518357 | 3.080167 | 4.596095 | 6.950661 | -1.538706 | 1.349705 | 1.828088 | -0.126373 | 0.867889 | 0.860175 | 3.056263 | 2.032822 | -0.343064 | -0.085005 | 0.645512 | 0.329514 | -0.353318 | -0.580070 | -0.526733 | -0.304053 | -0.338388 | 2.078130 | -0.379747 | -0.172305 | -0.489356 | -0.228307 | -0.525841 | -0.212142 | 1.977285 | -3.696352 | 2.056307 | -3.472640 | -3.943662 | -2.598421 | -0.653086 | 2.812150 | -0.926455 | -0.397507 | 1.004334 | 3.112293 | -0.339471 | 0.200917 | -0.184638 | -0.322345 | -0.299824 | 1.297429 | 0.254001 | -0.201176 | -0.265698 | 1.634190 | 0.047921 | 0.886104 | 2.082697 | -0.351213 | -0.088614 | 0.835389 | -0.572551 | -0.541194 | -0.034328 | 0.00000 | -0.588072 | -0.645610 | -0.432462 | 1.920332 | -0.964831 | -0.520275 | -0.600736 | -0.788713 | -0.519976 | -0.547699 | -0.616941 | -0.342736 | -0.392294 | -0.637820 | -0.568486 | -0.318679 | -0.105735 | 0.006372 | -0.487169 | 0.068521 | -0.421691 | 0.57735 | -0.324884 | -0.396454 | 0.507401 | 0.420681 | -0.333245 | -1.078104 | -1.172106 |
| 213 | 1 | -1.333306 | -0.041213 | -0.139675 | -0.386104 | -0.811294 | -0.997883 | -1.323397 | -1.350550 | -1.072245 | -0.805541 | -0.576453 | -0.635939 | -1.519862 | -1.291135 | -1.130948 | -0.875473 | -0.835906 | -1.501298 | -0.579037 | -0.404439 | 1.059219 | -0.609391 | -0.731022 | -0.217521 | -0.512201 | 0.471656 | 0.105772 | -0.710669 | 0.536368 | -0.109961 | 0.120097 | -1.265981 | -1.230259 | -1.340518 | -1.383138 | -1.036745 | -0.535100 | -0.106579 | -0.441860 | -0.548972 | 0.358617 | -1.448596 | 0.162656 | -0.999083 | -1.723015 | 2.022200 | -0.159741 | -1.036646 | 3.139599 | -1.063112 | -1.731836 | -1.107227 | 0.200015 | -0.894925 | -0.352849 | 0.157481 | -0.706905 | -0.281706 | 0.007065 | -0.581621 | -0.564977 | -0.875464 | -0.356954 | -1.355512 | -0.859285 | -0.942717 | -1.089183 | -0.909992 | -0.788635 | -1.050811 | 0.616379 | 1.123958 | 0.275542 | 0.000685 | -0.277191 | 0.502675 | 1.918059 | 1.892380 | 2.123650 | -1.659964 | 2.030455 | -0.775731 | 0.968992 | -0.908451 | -1.274443 | -1.325999 | -0.612749 | 1.072600 | -0.754863 | -1.488832 | -0.102442 | -0.640539 | -1.740243 | -1.796184 | -1.754622 | -0.343064 | -0.532750 | -2.129447 | -0.704780 | 3.148029 | -0.386412 | -0.526733 | -0.304053 | -0.338388 | 0.585245 | 4.540421 | -0.172305 | 2.617119 | 0.434499 | -0.045494 | -0.212142 | 0.296491 | 0.812956 | 0.495469 | 0.443729 | 1.153321 | 1.732163 | -1.786032 | -1.587029 | -0.638419 | 0.875330 | -0.504528 | -0.818373 | 1.707675 | -0.444059 | -0.523886 | -0.447394 | -0.299824 | -2.568101 | -0.674916 | -0.693060 | -0.349169 | 0.643535 | -0.464583 | -0.540650 | -0.524991 | -0.351213 | -0.288095 | -0.480802 | -0.572551 | -0.541194 | -0.320211 | -0.63724 | -0.588072 | 0.825107 | -0.790556 | -0.787589 | -0.984855 | -0.429036 | 0.214156 | 1.202957 | -0.435910 | -0.040269 | -0.265066 | 0.312924 | 0.027894 | -0.155159 | 0.340708 | -0.211193 | -0.385930 | 0.973401 | 0.339802 | -0.558663 | 0.084432 | 0.57735 | -0.324884 | -0.396454 | 0.507401 | 0.484057 | -1.378501 | 0.351005 | 0.853165 |
| 921 | 0 | 0.292738 | -0.434262 | 1.072062 | 0.741784 | 0.625757 | 0.671227 | 0.825040 | 1.036845 | 0.675418 | 0.211673 | 0.566650 | 0.759986 | 0.814770 | 0.816954 | 0.476718 | 0.312447 | 1.283588 | 1.234851 | 0.788347 | 0.080222 | 0.088042 | -0.346673 | 0.074959 | -0.217521 | 0.282582 | -0.643078 | -0.910536 | 1.280992 | 0.227825 | -0.781189 | 0.614512 | 0.848564 | 0.709720 | 0.831727 | 1.233370 | 0.995932 | -0.499276 | -0.101049 | -0.438342 | -0.311962 | 0.043695 | -0.029610 | -0.531976 | -0.777542 | 0.086109 | 0.454281 | -0.496675 | 0.852328 | 0.065922 | -0.060778 | 0.086157 | 0.164981 | 0.349999 | 0.960072 | 0.835855 | 1.376237 | -0.427176 | -0.360073 | -0.590769 | -0.398565 | -0.564977 | -0.199532 | -0.356764 | -0.461651 | -0.460951 | -0.261538 | -0.623374 | -0.104677 | -0.442607 | -0.390977 | 1.620289 | -0.851569 | 1.126315 | -0.989840 | -0.761142 | -0.543080 | 0.453873 | 1.705720 | 1.482873 | -1.279789 | 1.765168 | -1.353010 | 2.293500 | -0.792857 | -0.865635 | -0.702514 | -0.481056 | 0.479503 | 0.439295 | 0.057746 | -0.105930 | -0.354115 | 0.409283 | 0.125725 | 0.227759 | -0.343064 | -1.175885 | 1.208728 | -0.704780 | -0.353318 | 0.495369 | -0.179093 | -0.304053 | -0.338388 | -0.092912 | -0.379747 | -0.172305 | -0.124541 | -0.228307 | -0.512947 | -0.212142 | -0.599622 | 0.581007 | 0.035580 | 1.082245 | -0.238004 | 1.072814 | 0.938116 | 0.573782 | 0.474107 | -0.133128 | -0.506398 | -0.526381 | -0.234971 | -0.519621 | -0.499869 | -0.447394 | -0.299824 | 0.153655 | -0.837037 | -0.715651 | -0.402808 | -0.347119 | -0.464583 | 0.172727 | -0.406460 | -0.351213 | -0.288095 | -0.574816 | -0.572551 | -0.541194 | -0.320211 | -0.63724 | -0.588072 | -0.645610 | -0.790556 | -0.787589 | -0.585490 | 0.076210 | -0.392713 | -0.074123 | 0.118853 | -0.248215 | 0.426960 | -0.173645 | 0.077570 | 0.526518 | -0.187158 | -0.068680 | 0.089658 | 0.642597 | -0.240095 | -0.558663 | -0.322573 | 0.57735 | -0.324884 | -0.396454 | -1.160948 | -0.640143 | 0.109839 | -1.105747 | -1.172106 |
print(evci_test.shape)
evci_test.head()
(255, 173)
| n_RFID | LS_1 | PC2 | PC4 | PC5 | PC6 | PC7 | PC13 | PC14 | PC17 | PC8 | PC9 | PC10 | PC11 | PC12 | PC18 | PC22 | PC24 | PC25 | PC26 | PC27 | PC28 | PC29 | PC30 | PC31 | PC32 | PC33 | PC34 | PC35 | PC36 | PC37 | PC38 | PC39 | PC40 | PC41 | PC42 | PC43 | PC44 | PC3 | N_14 | N_15 | N_3 | N_16 | N_18 | N_19 | N_20 | N_21 | N_25 | N_26 | N_27 | N_33 | N_34 | N_35 | N_36 | N_24 | N_31 | N_37 | N_38 | N_39 | N_40 | N_41 | N_42 | N_43 | N_44 | N_45 | N_46 | N_47 | N_48 | N_49 | N_50 | N_51 | N_32 | N_29 | N_1 | N_62 | N_63 | N_30 | N_64 | N_5 | N_6 | N_7 | N_8 | N_9 | N_10 | N_11 | N_52 | N_53 | N_54 | N_13 | N_55 | N_56 | N_57 | N_58 | N_59 | N_60 | N_61 | LC_1 | LC_2 | LC_4 | LC_5 | LC_6 | LC_7 | LC_8 | LC_11 | LC_13 | LC_15 | LC_16 | LC_17 | LC_21 | LC_22 | LC_24 | LC_25 | L_1 | L_2 | L_3 | L_4 | L_5 | L_6 | EC_1 | EC_2 | EC_4 | EC_7 | EC_8 | EC_9 | EC_10 | EC_11 | NERST_nr_car | NERST_nr_mw | NERST_nr_truck | road_density | TRDENS_nr_car | TRDENS_nr_mw | TRDENS_nr_truck | n.accomodation | n.culture | n.education | n.entertainment | n.family | n.fashion | n.food | n.health | n.hobby | n.household | n.money | n.public | n.sport | n.transportation | n.work | min_dist.accomodation | min_dist.culture | min_dist.education | min_dist.entertainment | min_dist.family | min_dist.fashion | min_dist.food | min_dist.health | min_dist.hobby | min_dist.household | min_dist.money | min_dist.public | min_dist.sport | min_dist.transportation | min_dist.work | n_of_nn_chst | min_dist_chst | RoadType_residential | RoadType_secondary | RoadType_tertiary | npoint | max_power | lat | lon | CP_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 474 | 0 | 1.385493 | -2.399508 | -0.346248 | -0.194383 | -0.359774 | -1.222376 | 0.759633 | 0.865120 | -0.843432 | 0.872437 | -0.104773 | 0.083361 | -0.505148 | -1.580653 | 0.931676 | 1.059036 | -0.378286 | -0.556895 | 0.042534 | 1.313890 | 0.282277 | 0.704202 | -1.537003 | -2.397418 | -0.512201 | -1.014657 | 2.392464 | -0.511503 | -0.080719 | 1.890325 | -0.257700 | -0.475179 | 0.833635 | 0.747846 | 0.451576 | 0.355114 | -0.507277 | -0.106579 | 1.581523 | 0.316702 | -0.575528 | 0.282661 | -0.091944 | 2.153073 | -1.127246 | -1.332576 | 1.335690 | -1.769028 | -0.939916 | 1.155862 | 1.259505 | 0.044393 | -0.203563 | -1.605337 | -0.701638 | 0.747716 | 3.524989 | 2.035930 | 2.632563 | 4.774262 | 0.201430 | 3.785533 | -0.356954 | -0.277704 | 0.780637 | 0.761263 | -0.225815 | 0.070559 | 0.164488 | 0.318992 | -1.179226 | 2.030119 | -1.649110 | 1.718603 | 0.439031 | 0.697295 | -0.370397 | -0.725969 | -0.883958 | 0.271890 | -1.000673 | 1.165478 | -1.262454 | 2.532117 | 0.733908 | 0.965560 | 1.506331 | -0.360430 | -0.609078 | -0.295753 | -0.121081 | -0.103373 | -0.389231 | -0.311819 | -0.405792 | 1.142598 | 0.016514 | -0.378581 | -0.562776 | 0.870442 | 0.136265 | 1.726332 | -0.304053 | 0.013333 | -0.709919 | -0.379747 | -0.172305 | -0.504475 | -0.228307 | -0.334878 | -0.212142 | -0.334733 | -3.536731 | 0.855640 | -2.124485 | -0.481144 | -2.161406 | -3.096809 | -1.623904 | -1.063760 | -0.539568 | -0.881627 | -0.873059 | 0.125790 | 1.729959 | -0.283710 | -0.384869 | -0.299824 | 2.591719 | 0.182261 | -0.414179 | -0.218632 | -0.347119 | -0.464583 | -0.540650 | -0.524991 | -0.351213 | -0.288095 | -0.386788 | -0.572551 | 0.777398 | -0.320211 | 0.63724 | -0.588072 | 0.089748 | -0.790556 | -0.787589 | -0.752716 | -0.357531 | -0.133646 | 0.169502 | -0.691148 | -0.486591 | 0.142687 | -0.192833 | -0.514057 | -0.547208 | -0.419477 | -0.218916 | -0.422628 | 0.352248 | -0.221062 | 2.577259 | -0.438672 | 0.57735 | -0.324884 | -0.396454 | 0.507401 | 0.405080 | 0.647056 | -0.010754 | -1.172106 |
| 689 | 1 | 0.132886 | -0.041213 | -0.392623 | -0.097146 | -0.119379 | 0.143255 | 0.128098 | 0.001848 | 0.714554 | -0.094832 | 0.123772 | 0.024586 | -0.026420 | -0.123013 | -0.236104 | -0.141902 | -0.115631 | -0.039409 | -0.043573 | 0.119606 | 0.282277 | -0.083954 | -0.731022 | -0.217521 | -0.512201 | 1.214812 | 0.105772 | -0.909835 | -0.697807 | -0.202242 | 0.108104 | -0.060378 | 0.035188 | 0.020613 | 0.239545 | 0.617957 | -0.320450 | -0.106579 | 0.533363 | -0.087843 | -0.264038 | 0.219948 | -0.350678 | 0.107037 | -1.273507 | 1.233254 | -0.499338 | -0.100931 | 1.238093 | -0.034620 | -0.357882 | 2.113637 | 0.913580 | -0.484079 | -0.646400 | 0.187132 | 1.485733 | 1.771568 | -0.033457 | -0.076353 | 2.984021 | 0.739015 | -0.356954 | -0.301246 | -0.414598 | -0.395858 | -0.809200 | -0.507789 | -0.568990 | -0.550978 | -1.127138 | 1.628552 | -1.499576 | 2.339874 | -0.704773 | -0.389973 | -0.333866 | -1.042833 | -1.075091 | 0.868528 | -1.430041 | 1.050378 | -1.260567 | 0.690657 | 1.014206 | 0.026863 | 0.410587 | -1.143206 | 1.361387 | 0.009016 | -0.115838 | -0.225619 | -0.505027 | 0.159478 | 0.054719 | -0.343064 | 4.080203 | -0.703647 | -0.509289 | -0.353318 | -0.422533 | 0.294709 | -0.304053 | -0.338388 | 2.030185 | 0.706421 | -0.172305 | -0.504475 | -0.228307 | -0.203674 | -0.212142 | -1.760677 | -1.562642 | -0.604648 | -0.487629 | -0.960498 | -2.227927 | -0.316296 | -0.027328 | -0.194021 | 0.061774 | -0.061894 | -0.397579 | -0.036544 | -0.139982 | -0.262695 | -0.259821 | -0.299824 | 0.348573 | 2.950707 | 4.952980 | 6.620105 | 0.643535 | -0.464583 | -0.540650 | -0.406460 | -0.351213 | -0.288095 | -0.574816 | -0.572551 | -0.541194 | -0.320211 | -0.63724 | -0.588072 | -0.645610 | -0.432462 | -0.787589 | -1.131845 | -0.028598 | -0.041061 | 0.018786 | -0.580485 | -0.403013 | 0.510581 | -0.110971 | 1.226111 | -0.329402 | 0.063737 | 1.295281 | -0.304193 | -0.731759 | 0.282867 | 0.068521 | -0.291365 | 0.57735 | -0.324884 | -0.396454 | 0.507401 | 0.579609 | -0.481419 | -0.382129 | 0.853165 |
| 762 | 0 | 0.274165 | -0.434262 | 0.672099 | 0.373530 | 0.068740 | -0.108572 | -0.144142 | -0.272081 | -0.608567 | -0.279078 | 1.033732 | 0.458187 | -0.189864 | -0.301142 | -0.106470 | -0.132561 | 0.978449 | 0.280274 | -0.487172 | -0.278404 | 0.864983 | -1.134829 | -0.731022 | -0.217521 | -0.512201 | -0.643078 | -0.656459 | 1.679324 | -0.389263 | -0.868952 | 0.016581 | 0.069309 | -0.152959 | -0.271381 | -0.060541 | -0.367069 | -0.473314 | -0.106579 | -0.441860 | 0.068699 | -0.580982 | 0.652307 | 0.060867 | -0.236880 | 1.013560 | -1.214081 | -0.162654 | 0.755373 | -1.260794 | 0.498659 | 0.190771 | -0.628252 | -0.808637 | 0.300692 | 1.551343 | -0.643267 | -0.318371 | -0.552017 | -0.357067 | -0.311645 | -0.311220 | -0.087531 | -0.356954 | -0.590167 | -0.514682 | 0.009634 | -0.666845 | -0.241531 | -0.030480 | -0.361393 | -0.153401 | -1.334129 | 1.310526 | -0.628830 | -0.992723 | -0.780449 | -0.881216 | -0.075033 | -0.445567 | -0.715210 | 0.239950 | -1.362829 | 0.574208 | -0.613176 | 0.311412 | 0.677485 | -0.117287 | 0.724664 | -0.360832 | 0.489167 | -0.106981 | -0.179128 | 0.800638 | 0.770560 | 0.446160 | -0.343064 | -0.026166 | 1.318029 | -0.704780 | -0.353318 | -0.580070 | -0.526733 | -0.304053 | -0.338388 | 0.751202 | -0.379747 | -0.172305 | -0.504475 | -0.228307 | -0.069956 | -0.212142 | -0.175023 | 0.688715 | 0.022506 | 0.240039 | 0.150951 | -0.007030 | 0.070602 | 0.800065 | 0.777829 | -0.377377 | -0.728021 | -0.576353 | -0.391534 | -0.657331 | -0.472849 | -0.384869 | -0.299824 | 0.912240 | 0.113386 | -0.317989 | -0.198978 | -0.347119 | -0.464583 | 1.599481 | -0.169397 | -0.351213 | -0.288095 | -0.386788 | -0.572551 | -0.541194 | -0.320211 | -0.63724 | -0.588072 | -0.645610 | -0.432462 | -0.787589 | 0.883417 | -0.299866 | -0.549252 | -0.489816 | -0.668062 | -0.427217 | -0.620804 | -0.210990 | -0.281822 | -0.492968 | -0.258918 | -0.287242 | -0.197771 | 0.112170 | -0.114927 | -0.558663 | -0.085336 | 0.57735 | -0.324884 | -0.396454 | 0.507401 | -3.282452 | 0.879671 | 1.253738 | 0.853165 |
| 643 | 0 | 1.868870 | -1.220360 | -0.078303 | 0.203117 | 0.319325 | -0.037439 | 1.187290 | 0.486956 | -0.114936 | -0.214240 | 0.146557 | 0.795869 | 0.187726 | -0.256279 | 0.748040 | 0.353922 | 0.280693 | 0.715206 | 0.126977 | 0.330898 | 0.670748 | 0.178765 | -1.134013 | -0.217521 | -0.512201 | -0.828868 | 0.359849 | 0.484328 | 0.536368 | 0.861198 | -0.491152 | 0.322308 | 0.608754 | 1.002160 | 1.115269 | 0.756467 | -0.474619 | -0.106579 | -0.441860 | 0.386371 | -0.580982 | 1.324373 | -0.092421 | -0.437297 | 1.180614 | -0.827805 | -0.167285 | 0.264858 | -0.649357 | 0.271094 | -0.264450 | 0.303831 | -0.532646 | -0.496096 | 0.896299 | 0.029815 | 0.052766 | -0.168600 | 0.583411 | -0.128560 | 0.194773 | 0.115613 | -0.356954 | -0.544723 | -0.414192 | 1.161450 | 0.583416 | 0.776308 | 0.256924 | 0.259242 | 0.029305 | -0.571955 | 0.552790 | 0.061111 | -0.742850 | -0.832163 | -0.720504 | 0.677571 | 0.446384 | -1.077100 | 1.096281 | -1.036073 | 1.098041 | -0.622521 | 1.012636 | 1.249635 | -0.136817 | 0.944867 | 0.687185 | 1.245970 | -0.107757 | -0.238476 | 1.430515 | 1.475673 | 1.421358 | -0.343064 | -0.227459 | 1.601398 | -0.455591 | -0.353318 | -0.580070 | -0.526733 | -0.304053 | -0.338388 | -0.709919 | -0.379747 | -0.172305 | -0.504475 | -0.228307 | -0.102065 | -0.212142 | 1.365701 | 0.083473 | 0.042540 | -0.200460 | 0.262544 | 0.476049 | -0.497447 | 1.295388 | 0.232059 | -0.404588 | -0.706794 | -0.567830 | -0.397305 | -0.609971 | -0.493864 | -0.447394 | -0.299824 | 2.265221 | -0.520443 | -0.592368 | -0.375263 | -0.347119 | -0.464583 | 0.886104 | -0.406460 | -0.351213 | -0.088614 | -0.386788 | 0.665133 | 0.118102 | 0.251554 | -0.63724 | -0.588072 | 0.089748 | -0.790556 | -0.110609 | 1.322266 | -0.325619 | -0.442310 | 0.046489 | -0.718301 | -0.506466 | 0.143444 | -0.485155 | -0.454565 | -0.580648 | -0.381361 | 2.222107 | -0.405291 | 0.110645 | -0.279268 | -0.558663 | -0.549434 | 0.57735 | -0.324884 | -0.396454 | 0.507401 | -3.185924 | 0.247814 | 0.236190 | -1.172106 |
| 879 | 0 | 0.622219 | -1.220360 | -0.333556 | -0.382118 | -0.428163 | -1.013656 | 0.311666 | 0.398070 | -0.602314 | 0.146683 | -0.579369 | -0.082761 | -0.211206 | -0.990895 | 0.274536 | 0.014553 | -0.690715 | -0.283854 | 0.181804 | 0.277059 | -0.106193 | 0.441483 | -0.328032 | -0.217521 | -0.512201 | -0.457289 | 1.884310 | -1.109001 | 0.227825 | 1.903725 | -0.007359 | -0.429727 | 0.406385 | 0.698645 | 0.043006 | -0.273587 | -0.425612 | -0.106579 | -0.441860 | 0.335682 | -0.580982 | 0.001828 | -0.318994 | 0.911241 | -1.135950 | 0.151536 | -0.006044 | -0.152190 | 0.045786 | 0.119548 | 0.608943 | -0.246758 | -0.225640 | 0.060664 | -0.337586 | 1.023680 | 1.736465 | 1.442399 | 0.211964 | 1.143692 | 0.490138 | 2.635818 | -0.356954 | -0.330265 | 0.471469 | 0.688596 | 0.239106 | 0.356206 | -0.155482 | 0.302824 | -0.004716 | 1.122884 | -0.517099 | -0.825543 | 1.969773 | 0.977166 | -0.145083 | 0.317820 | 0.373763 | -1.175636 | 0.253101 | -0.254740 | -0.174544 | -0.385120 | -0.297596 | -0.141505 | -0.240642 | 0.518731 | 0.220608 | 0.177022 | -0.109245 | 0.021891 | -0.510581 | -0.073655 | 0.170581 | -0.343064 | -0.459951 | 0.357647 | 0.113948 | -0.353318 | 0.417687 | -0.201344 | 2.404405 | -0.337093 | 1.118426 | -0.379747 | -0.172305 | -0.504475 | -0.228307 | -0.358149 | -0.212142 | 0.026202 | -1.531519 | 0.301673 | -0.368334 | 0.234265 | -0.294029 | -0.249076 | -0.079489 | -0.495554 | -0.246980 | 0.265076 | 0.245991 | 0.225630 | 1.479221 | -0.487860 | -0.447394 | -0.299824 | 0.068677 | 0.103356 | 0.128147 | -0.201832 | -0.347119 | -0.464583 | -0.540650 | 0.304728 | 1.517018 | 0.576321 | 0.459334 | 0.665133 | 0.118102 | 0.823319 | 0.63724 | -0.588072 | -0.645610 | 2.432290 | -0.110609 | -0.338805 | -0.369323 | -0.389935 | -0.076465 | -0.754845 | -0.522836 | -0.019840 | -0.379604 | -0.483652 | -0.578653 | -0.416227 | 0.035632 | -0.214148 | -0.314149 | -0.336948 | 1.322890 | -0.483763 | 0.57735 | -0.324884 | -0.396454 | -1.160948 | -0.649893 | 0.483688 | -0.748047 | -1.172106 |
## Treating outliers for the train data
plt.figure(figsize = (16,8))
evci_train[scaled_cols].boxplot()
plt.xticks(rotation = 90)
plt.show()
# outlier treatment for the train dataset
evci_train2 = evci_train[~(evci_train>10).any(axis = 1)]
print(evci_train2.shape)
print(evci_train2.shape[0]/evci_train.shape[0])
(972, 173) 0.9566929133858267
X_train = evci_train2.drop('n_RFID', axis =1)
y_train = evci_train2['n_RFID']
X_train.head()
| LS_1 | PC2 | PC4 | PC5 | PC6 | PC7 | PC13 | PC14 | PC17 | PC8 | PC9 | PC10 | PC11 | PC12 | PC18 | PC22 | PC24 | PC25 | PC26 | PC27 | PC28 | PC29 | PC30 | PC31 | PC32 | PC33 | PC34 | PC35 | PC36 | PC37 | PC38 | PC39 | PC40 | PC41 | PC42 | PC43 | PC44 | PC3 | N_14 | N_15 | N_3 | N_16 | N_18 | N_19 | N_20 | N_21 | N_25 | N_26 | N_27 | N_33 | N_34 | N_35 | N_36 | N_24 | N_31 | N_37 | N_38 | N_39 | N_40 | N_41 | N_42 | N_43 | N_44 | N_45 | N_46 | N_47 | N_48 | N_49 | N_50 | N_51 | N_32 | N_29 | N_1 | N_62 | N_63 | N_30 | N_64 | N_5 | N_6 | N_7 | N_8 | N_9 | N_10 | N_11 | N_52 | N_53 | N_54 | N_13 | N_55 | N_56 | N_57 | N_58 | N_59 | N_60 | N_61 | LC_1 | LC_2 | LC_4 | LC_5 | LC_6 | LC_7 | LC_8 | LC_11 | LC_13 | LC_15 | LC_16 | LC_17 | LC_21 | LC_22 | LC_24 | LC_25 | L_1 | L_2 | L_3 | L_4 | L_5 | L_6 | EC_1 | EC_2 | EC_4 | EC_7 | EC_8 | EC_9 | EC_10 | EC_11 | NERST_nr_car | NERST_nr_mw | NERST_nr_truck | road_density | TRDENS_nr_car | TRDENS_nr_mw | TRDENS_nr_truck | n.accomodation | n.culture | n.education | n.entertainment | n.family | n.fashion | n.food | n.health | n.hobby | n.household | n.money | n.public | n.sport | n.transportation | n.work | min_dist.accomodation | min_dist.culture | min_dist.education | min_dist.entertainment | min_dist.family | min_dist.fashion | min_dist.food | min_dist.health | min_dist.hobby | min_dist.household | min_dist.money | min_dist.public | min_dist.sport | min_dist.transportation | min_dist.work | n_of_nn_chst | min_dist_chst | RoadType_residential | RoadType_secondary | RoadType_tertiary | npoint | max_power | lat | lon | CP_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1262 | -0.376875 | -0.434262 | -0.733065 | -1.027260 | -1.215038 | -0.978980 | -0.453521 | -0.459945 | -1.022952 | -0.348182 | -0.955437 | -1.067853 | -0.761427 | -0.761935 | -1.314335 | -1.067746 | -0.729443 | -0.589163 | -1.091799 | -0.838296 | -0.494664 | -0.083954 | 0.477949 | -0.217521 | 1.077365 | 0.471656 | -1.164613 | 0.683494 | -0.697807 | 0.299874 | -0.368939 | -1.061176 | -0.877638 | -0.611911 | -0.101369 | 0.782613 | 0.162315 | -0.106579 | -0.441860 | -0.818817 | 1.063137 | -0.659484 | -0.011835 | -0.469936 | 0.107061 | -0.248029 | -0.666275 | 1.175575 | -0.310689 | -0.371456 | 0.495093 | -0.530666 | -0.091078 | 0.796591 | 1.193151 | -1.244306 | -0.790912 | -0.552017 | -0.590769 | -0.581621 | -0.564977 | -0.677306 | 1.039392 | -0.436775 | -0.572880 | -0.573530 | -0.523166 | -0.659569 | -0.589988 | -0.659232 | 0.033875 | -1.123871 | 1.153789 | -0.945807 | -0.558837 | 0.168965 | -0.521526 | -0.089355 | -0.390362 | -0.008005 | 0.230086 | -1.005183 | 0.558439 | -1.119511 | -0.799133 | -0.851959 | -0.540803 | 1.072600 | -0.798892 | -0.527447 | -0.097598 | -0.090609 | -0.313090 | -0.594529 | -0.622255 | -0.343064 | -0.571922 | 0.916368 | -0.329035 | -0.353318 | -0.580070 | -0.121893 | -0.304053 | -0.338388 | -0.240360 | -0.379747 | -0.172305 | 0.433626 | -0.228307 | -0.525841 | -0.212142 | -0.116096 | 0.827089 | -1.155199 | 1.378326 | 0.480242 | 0.862566 | 0.078199 | -0.661574 | 0.058103 | -0.244783 | -0.881627 | -0.875898 | 0.205653 | -0.189136 | -0.310730 | -0.384869 | -0.299824 | -0.542030 | -0.678605 | -0.592346 | -0.391972 | -0.347119 | -0.464583 | -0.540650 | -0.287928 | -0.351213 | -0.288095 | -0.198761 | -0.572551 | -0.541194 | -0.320211 | 0.00000 | -0.588072 | 0.089748 | -0.432462 | -0.787589 | 0.755117 | -0.199932 | -0.005085 | -0.096130 | 0.459335 | 1.828874 | -0.617712 | -0.169228 | -0.012589 | 1.386519 | -0.584940 | 3.188998 | -0.520886 | -0.015039 | 2.025458 | -0.558663 | -0.015291 | 0.57735 | -0.324884 | -0.396454 | 0.507401 | -3.309752 | -1.230843 | -1.780691 | 0.853165 |
| 13 | 0.346231 | -0.827311 | 1.147768 | 2.490096 | 2.728308 | 1.340875 | 0.914969 | 0.608591 | 1.434642 | 1.449416 | 1.701539 | 2.139729 | 0.193386 | -0.064525 | 1.159011 | 2.223665 | 0.284614 | 0.182361 | 1.962437 | 2.421862 | 1.641925 | -1.134829 | -1.134013 | -0.217521 | -0.512201 | -1.572024 | 1.122079 | -0.312337 | 1.770543 | -0.633591 | 1.360551 | 1.195655 | 1.185450 | 0.297275 | 0.311186 | 1.072135 | -0.538755 | -0.099055 | -0.441860 | -0.274537 | 0.296562 | -0.429647 | -0.213905 | -0.908903 | 0.729981 | 0.137992 | -0.151302 | 0.251552 | 0.016002 | -0.677386 | -0.412036 | -0.222759 | 0.022486 | -0.434958 | 0.293295 | 1.236690 | 0.022765 | -0.186681 | 0.217227 | 0.189263 | -0.564977 | 0.838697 | -0.349333 | -0.206103 | -0.319152 | -0.146064 | 0.844171 | -0.018543 | -0.092288 | -0.090699 | 2.498314 | -0.470199 | -0.602621 | 1.019162 | -0.276257 | -0.665324 | 0.041909 | 2.182018 | 2.079884 | -0.458539 | 1.258868 | -0.316826 | 1.174269 | -0.064213 | -0.621718 | -0.843968 | -0.407988 | -0.041333 | -0.325459 | -0.432752 | -0.111328 | -0.209493 | -0.593013 | -0.562200 | -0.437725 | -0.343064 | -0.676098 | 0.464252 | -0.282183 | -0.353318 | -0.580070 | -0.389625 | -0.304053 | -0.338388 | 0.421271 | 0.952816 | -0.172305 | -0.504475 | 1.044525 | 0.287735 | -0.211844 | 0.424450 | 0.106430 | 1.091191 | 0.318018 | 0.342494 | 1.732163 | 0.870142 | -0.124882 | 0.291830 | -0.197132 | -0.310242 | -0.337743 | -0.087284 | -0.167133 | -0.301724 | -0.384869 | -0.299824 | -0.712913 | -0.592993 | -0.549030 | -0.296038 | 0.643535 | -0.464583 | -0.540650 | -0.524991 | -0.351213 | -0.288095 | -0.104747 | -0.572551 | -0.541194 | -0.320211 | -0.63724 | -0.588072 | -0.645610 | -0.432462 | -0.110609 | -1.055597 | -0.404178 | -0.095513 | 0.376731 | -0.722153 | -0.141373 | -0.388376 | 0.224409 | -0.018129 | -0.239387 | 0.524961 | 0.391409 | 0.241521 | -0.021650 | -0.477313 | 0.068521 | -0.492116 | 0.57735 | -0.324884 | -0.396454 | 0.507401 | 0.333904 | 0.506513 | -0.531288 | -1.172106 |
| 451 | 2.493564 | -0.434262 | 0.708571 | 1.472991 | 1.642536 | 1.516160 | 0.915795 | 1.028120 | 1.386009 | 2.025219 | 1.786051 | 1.201272 | 0.731935 | 0.549572 | 0.752266 | 2.309080 | 0.378128 | 0.179609 | 0.915217 | 2.550026 | 1.059219 | -1.134829 | -0.731022 | -0.217521 | -0.512201 | -0.643078 | 0.868003 | -0.113171 | 0.227825 | -0.898734 | 0.285017 | 1.277983 | 1.314188 | 0.313945 | 0.462885 | 1.320011 | -0.540741 | -0.106579 | 2.842768 | 4.066697 | -1.403041 | 4.459411 | 1.276537 | 1.305388 | -1.011122 | -1.535925 | 1.856617 | -1.726546 | -1.292263 | 4.548387 | 0.753823 | 1.377398 | -0.743927 | -1.843575 | -0.550742 | 0.715107 | 4.377667 | 3.844228 | 1.844132 | 3.130121 | 3.728318 | 3.567515 | 0.593542 | 2.834711 | 1.392573 | 3.114151 | 0.953030 | 3.756520 | 4.510421 | 3.177587 | -1.199392 | 2.343358 | -1.663843 | 1.060510 | 1.107588 | -0.439271 | 0.301718 | -1.085644 | -1.341852 | 2.165048 | -0.937869 | 1.718677 | -1.001765 | 3.518357 | 3.080167 | 4.596095 | 6.950661 | -1.538706 | 1.349705 | 1.828088 | -0.126373 | 0.867889 | 0.860175 | 3.056263 | 2.032822 | -0.343064 | -0.085005 | 0.645512 | 0.329514 | -0.353318 | -0.580070 | -0.526733 | -0.304053 | -0.338388 | 2.078130 | -0.379747 | -0.172305 | -0.489356 | -0.228307 | -0.525841 | -0.212142 | 1.977285 | -3.696352 | 2.056307 | -3.472640 | -3.943662 | -2.598421 | -0.653086 | 2.812150 | -0.926455 | -0.397507 | 1.004334 | 3.112293 | -0.339471 | 0.200917 | -0.184638 | -0.322345 | -0.299824 | 1.297429 | 0.254001 | -0.201176 | -0.265698 | 1.634190 | 0.047921 | 0.886104 | 2.082697 | -0.351213 | -0.088614 | 0.835389 | -0.572551 | -0.541194 | -0.034328 | 0.00000 | -0.588072 | -0.645610 | -0.432462 | 1.920332 | -0.964831 | -0.520275 | -0.600736 | -0.788713 | -0.519976 | -0.547699 | -0.616941 | -0.342736 | -0.392294 | -0.637820 | -0.568486 | -0.318679 | -0.105735 | 0.006372 | -0.487169 | 0.068521 | -0.421691 | 0.57735 | -0.324884 | -0.396454 | 0.507401 | 0.420681 | -0.333245 | -1.078104 | -1.172106 |
| 213 | -1.333306 | -0.041213 | -0.139675 | -0.386104 | -0.811294 | -0.997883 | -1.323397 | -1.350550 | -1.072245 | -0.805541 | -0.576453 | -0.635939 | -1.519862 | -1.291135 | -1.130948 | -0.875473 | -0.835906 | -1.501298 | -0.579037 | -0.404439 | 1.059219 | -0.609391 | -0.731022 | -0.217521 | -0.512201 | 0.471656 | 0.105772 | -0.710669 | 0.536368 | -0.109961 | 0.120097 | -1.265981 | -1.230259 | -1.340518 | -1.383138 | -1.036745 | -0.535100 | -0.106579 | -0.441860 | -0.548972 | 0.358617 | -1.448596 | 0.162656 | -0.999083 | -1.723015 | 2.022200 | -0.159741 | -1.036646 | 3.139599 | -1.063112 | -1.731836 | -1.107227 | 0.200015 | -0.894925 | -0.352849 | 0.157481 | -0.706905 | -0.281706 | 0.007065 | -0.581621 | -0.564977 | -0.875464 | -0.356954 | -1.355512 | -0.859285 | -0.942717 | -1.089183 | -0.909992 | -0.788635 | -1.050811 | 0.616379 | 1.123958 | 0.275542 | 0.000685 | -0.277191 | 0.502675 | 1.918059 | 1.892380 | 2.123650 | -1.659964 | 2.030455 | -0.775731 | 0.968992 | -0.908451 | -1.274443 | -1.325999 | -0.612749 | 1.072600 | -0.754863 | -1.488832 | -0.102442 | -0.640539 | -1.740243 | -1.796184 | -1.754622 | -0.343064 | -0.532750 | -2.129447 | -0.704780 | 3.148029 | -0.386412 | -0.526733 | -0.304053 | -0.338388 | 0.585245 | 4.540421 | -0.172305 | 2.617119 | 0.434499 | -0.045494 | -0.212142 | 0.296491 | 0.812956 | 0.495469 | 0.443729 | 1.153321 | 1.732163 | -1.786032 | -1.587029 | -0.638419 | 0.875330 | -0.504528 | -0.818373 | 1.707675 | -0.444059 | -0.523886 | -0.447394 | -0.299824 | -2.568101 | -0.674916 | -0.693060 | -0.349169 | 0.643535 | -0.464583 | -0.540650 | -0.524991 | -0.351213 | -0.288095 | -0.480802 | -0.572551 | -0.541194 | -0.320211 | -0.63724 | -0.588072 | 0.825107 | -0.790556 | -0.787589 | -0.984855 | -0.429036 | 0.214156 | 1.202957 | -0.435910 | -0.040269 | -0.265066 | 0.312924 | 0.027894 | -0.155159 | 0.340708 | -0.211193 | -0.385930 | 0.973401 | 0.339802 | -0.558663 | 0.084432 | 0.57735 | -0.324884 | -0.396454 | 0.507401 | 0.484057 | -1.378501 | 0.351005 | 0.853165 |
| 921 | 0.292738 | -0.434262 | 1.072062 | 0.741784 | 0.625757 | 0.671227 | 0.825040 | 1.036845 | 0.675418 | 0.211673 | 0.566650 | 0.759986 | 0.814770 | 0.816954 | 0.476718 | 0.312447 | 1.283588 | 1.234851 | 0.788347 | 0.080222 | 0.088042 | -0.346673 | 0.074959 | -0.217521 | 0.282582 | -0.643078 | -0.910536 | 1.280992 | 0.227825 | -0.781189 | 0.614512 | 0.848564 | 0.709720 | 0.831727 | 1.233370 | 0.995932 | -0.499276 | -0.101049 | -0.438342 | -0.311962 | 0.043695 | -0.029610 | -0.531976 | -0.777542 | 0.086109 | 0.454281 | -0.496675 | 0.852328 | 0.065922 | -0.060778 | 0.086157 | 0.164981 | 0.349999 | 0.960072 | 0.835855 | 1.376237 | -0.427176 | -0.360073 | -0.590769 | -0.398565 | -0.564977 | -0.199532 | -0.356764 | -0.461651 | -0.460951 | -0.261538 | -0.623374 | -0.104677 | -0.442607 | -0.390977 | 1.620289 | -0.851569 | 1.126315 | -0.989840 | -0.761142 | -0.543080 | 0.453873 | 1.705720 | 1.482873 | -1.279789 | 1.765168 | -1.353010 | 2.293500 | -0.792857 | -0.865635 | -0.702514 | -0.481056 | 0.479503 | 0.439295 | 0.057746 | -0.105930 | -0.354115 | 0.409283 | 0.125725 | 0.227759 | -0.343064 | -1.175885 | 1.208728 | -0.704780 | -0.353318 | 0.495369 | -0.179093 | -0.304053 | -0.338388 | -0.092912 | -0.379747 | -0.172305 | -0.124541 | -0.228307 | -0.512947 | -0.212142 | -0.599622 | 0.581007 | 0.035580 | 1.082245 | -0.238004 | 1.072814 | 0.938116 | 0.573782 | 0.474107 | -0.133128 | -0.506398 | -0.526381 | -0.234971 | -0.519621 | -0.499869 | -0.447394 | -0.299824 | 0.153655 | -0.837037 | -0.715651 | -0.402808 | -0.347119 | -0.464583 | 0.172727 | -0.406460 | -0.351213 | -0.288095 | -0.574816 | -0.572551 | -0.541194 | -0.320211 | -0.63724 | -0.588072 | -0.645610 | -0.790556 | -0.787589 | -0.585490 | 0.076210 | -0.392713 | -0.074123 | 0.118853 | -0.248215 | 0.426960 | -0.173645 | 0.077570 | 0.526518 | -0.187158 | -0.068680 | 0.089658 | 0.642597 | -0.240095 | -0.558663 | -0.322573 | 0.57735 | -0.324884 | -0.396454 | -1.160948 | -0.640143 | 0.109839 | -1.105747 | -1.172106 |
y_train.head()
1262 0 13 0 451 1 213 1 921 0 Name: n_RFID, dtype: int64
X_test = evci_test.drop('n_RFID', axis = 1)
y_test = evci_test['n_RFID']
X_test.head()
| LS_1 | PC2 | PC4 | PC5 | PC6 | PC7 | PC13 | PC14 | PC17 | PC8 | PC9 | PC10 | PC11 | PC12 | PC18 | PC22 | PC24 | PC25 | PC26 | PC27 | PC28 | PC29 | PC30 | PC31 | PC32 | PC33 | PC34 | PC35 | PC36 | PC37 | PC38 | PC39 | PC40 | PC41 | PC42 | PC43 | PC44 | PC3 | N_14 | N_15 | N_3 | N_16 | N_18 | N_19 | N_20 | N_21 | N_25 | N_26 | N_27 | N_33 | N_34 | N_35 | N_36 | N_24 | N_31 | N_37 | N_38 | N_39 | N_40 | N_41 | N_42 | N_43 | N_44 | N_45 | N_46 | N_47 | N_48 | N_49 | N_50 | N_51 | N_32 | N_29 | N_1 | N_62 | N_63 | N_30 | N_64 | N_5 | N_6 | N_7 | N_8 | N_9 | N_10 | N_11 | N_52 | N_53 | N_54 | N_13 | N_55 | N_56 | N_57 | N_58 | N_59 | N_60 | N_61 | LC_1 | LC_2 | LC_4 | LC_5 | LC_6 | LC_7 | LC_8 | LC_11 | LC_13 | LC_15 | LC_16 | LC_17 | LC_21 | LC_22 | LC_24 | LC_25 | L_1 | L_2 | L_3 | L_4 | L_5 | L_6 | EC_1 | EC_2 | EC_4 | EC_7 | EC_8 | EC_9 | EC_10 | EC_11 | NERST_nr_car | NERST_nr_mw | NERST_nr_truck | road_density | TRDENS_nr_car | TRDENS_nr_mw | TRDENS_nr_truck | n.accomodation | n.culture | n.education | n.entertainment | n.family | n.fashion | n.food | n.health | n.hobby | n.household | n.money | n.public | n.sport | n.transportation | n.work | min_dist.accomodation | min_dist.culture | min_dist.education | min_dist.entertainment | min_dist.family | min_dist.fashion | min_dist.food | min_dist.health | min_dist.hobby | min_dist.household | min_dist.money | min_dist.public | min_dist.sport | min_dist.transportation | min_dist.work | n_of_nn_chst | min_dist_chst | RoadType_residential | RoadType_secondary | RoadType_tertiary | npoint | max_power | lat | lon | CP_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 474 | 1.385493 | -2.399508 | -0.346248 | -0.194383 | -0.359774 | -1.222376 | 0.759633 | 0.865120 | -0.843432 | 0.872437 | -0.104773 | 0.083361 | -0.505148 | -1.580653 | 0.931676 | 1.059036 | -0.378286 | -0.556895 | 0.042534 | 1.313890 | 0.282277 | 0.704202 | -1.537003 | -2.397418 | -0.512201 | -1.014657 | 2.392464 | -0.511503 | -0.080719 | 1.890325 | -0.257700 | -0.475179 | 0.833635 | 0.747846 | 0.451576 | 0.355114 | -0.507277 | -0.106579 | 1.581523 | 0.316702 | -0.575528 | 0.282661 | -0.091944 | 2.153073 | -1.127246 | -1.332576 | 1.335690 | -1.769028 | -0.939916 | 1.155862 | 1.259505 | 0.044393 | -0.203563 | -1.605337 | -0.701638 | 0.747716 | 3.524989 | 2.035930 | 2.632563 | 4.774262 | 0.201430 | 3.785533 | -0.356954 | -0.277704 | 0.780637 | 0.761263 | -0.225815 | 0.070559 | 0.164488 | 0.318992 | -1.179226 | 2.030119 | -1.649110 | 1.718603 | 0.439031 | 0.697295 | -0.370397 | -0.725969 | -0.883958 | 0.271890 | -1.000673 | 1.165478 | -1.262454 | 2.532117 | 0.733908 | 0.965560 | 1.506331 | -0.360430 | -0.609078 | -0.295753 | -0.121081 | -0.103373 | -0.389231 | -0.311819 | -0.405792 | 1.142598 | 0.016514 | -0.378581 | -0.562776 | 0.870442 | 0.136265 | 1.726332 | -0.304053 | 0.013333 | -0.709919 | -0.379747 | -0.172305 | -0.504475 | -0.228307 | -0.334878 | -0.212142 | -0.334733 | -3.536731 | 0.855640 | -2.124485 | -0.481144 | -2.161406 | -3.096809 | -1.623904 | -1.063760 | -0.539568 | -0.881627 | -0.873059 | 0.125790 | 1.729959 | -0.283710 | -0.384869 | -0.299824 | 2.591719 | 0.182261 | -0.414179 | -0.218632 | -0.347119 | -0.464583 | -0.540650 | -0.524991 | -0.351213 | -0.288095 | -0.386788 | -0.572551 | 0.777398 | -0.320211 | 0.63724 | -0.588072 | 0.089748 | -0.790556 | -0.787589 | -0.752716 | -0.357531 | -0.133646 | 0.169502 | -0.691148 | -0.486591 | 0.142687 | -0.192833 | -0.514057 | -0.547208 | -0.419477 | -0.218916 | -0.422628 | 0.352248 | -0.221062 | 2.577259 | -0.438672 | 0.57735 | -0.324884 | -0.396454 | 0.507401 | 0.405080 | 0.647056 | -0.010754 | -1.172106 |
| 689 | 0.132886 | -0.041213 | -0.392623 | -0.097146 | -0.119379 | 0.143255 | 0.128098 | 0.001848 | 0.714554 | -0.094832 | 0.123772 | 0.024586 | -0.026420 | -0.123013 | -0.236104 | -0.141902 | -0.115631 | -0.039409 | -0.043573 | 0.119606 | 0.282277 | -0.083954 | -0.731022 | -0.217521 | -0.512201 | 1.214812 | 0.105772 | -0.909835 | -0.697807 | -0.202242 | 0.108104 | -0.060378 | 0.035188 | 0.020613 | 0.239545 | 0.617957 | -0.320450 | -0.106579 | 0.533363 | -0.087843 | -0.264038 | 0.219948 | -0.350678 | 0.107037 | -1.273507 | 1.233254 | -0.499338 | -0.100931 | 1.238093 | -0.034620 | -0.357882 | 2.113637 | 0.913580 | -0.484079 | -0.646400 | 0.187132 | 1.485733 | 1.771568 | -0.033457 | -0.076353 | 2.984021 | 0.739015 | -0.356954 | -0.301246 | -0.414598 | -0.395858 | -0.809200 | -0.507789 | -0.568990 | -0.550978 | -1.127138 | 1.628552 | -1.499576 | 2.339874 | -0.704773 | -0.389973 | -0.333866 | -1.042833 | -1.075091 | 0.868528 | -1.430041 | 1.050378 | -1.260567 | 0.690657 | 1.014206 | 0.026863 | 0.410587 | -1.143206 | 1.361387 | 0.009016 | -0.115838 | -0.225619 | -0.505027 | 0.159478 | 0.054719 | -0.343064 | 4.080203 | -0.703647 | -0.509289 | -0.353318 | -0.422533 | 0.294709 | -0.304053 | -0.338388 | 2.030185 | 0.706421 | -0.172305 | -0.504475 | -0.228307 | -0.203674 | -0.212142 | -1.760677 | -1.562642 | -0.604648 | -0.487629 | -0.960498 | -2.227927 | -0.316296 | -0.027328 | -0.194021 | 0.061774 | -0.061894 | -0.397579 | -0.036544 | -0.139982 | -0.262695 | -0.259821 | -0.299824 | 0.348573 | 2.950707 | 4.952980 | 6.620105 | 0.643535 | -0.464583 | -0.540650 | -0.406460 | -0.351213 | -0.288095 | -0.574816 | -0.572551 | -0.541194 | -0.320211 | -0.63724 | -0.588072 | -0.645610 | -0.432462 | -0.787589 | -1.131845 | -0.028598 | -0.041061 | 0.018786 | -0.580485 | -0.403013 | 0.510581 | -0.110971 | 1.226111 | -0.329402 | 0.063737 | 1.295281 | -0.304193 | -0.731759 | 0.282867 | 0.068521 | -0.291365 | 0.57735 | -0.324884 | -0.396454 | 0.507401 | 0.579609 | -0.481419 | -0.382129 | 0.853165 |
| 762 | 0.274165 | -0.434262 | 0.672099 | 0.373530 | 0.068740 | -0.108572 | -0.144142 | -0.272081 | -0.608567 | -0.279078 | 1.033732 | 0.458187 | -0.189864 | -0.301142 | -0.106470 | -0.132561 | 0.978449 | 0.280274 | -0.487172 | -0.278404 | 0.864983 | -1.134829 | -0.731022 | -0.217521 | -0.512201 | -0.643078 | -0.656459 | 1.679324 | -0.389263 | -0.868952 | 0.016581 | 0.069309 | -0.152959 | -0.271381 | -0.060541 | -0.367069 | -0.473314 | -0.106579 | -0.441860 | 0.068699 | -0.580982 | 0.652307 | 0.060867 | -0.236880 | 1.013560 | -1.214081 | -0.162654 | 0.755373 | -1.260794 | 0.498659 | 0.190771 | -0.628252 | -0.808637 | 0.300692 | 1.551343 | -0.643267 | -0.318371 | -0.552017 | -0.357067 | -0.311645 | -0.311220 | -0.087531 | -0.356954 | -0.590167 | -0.514682 | 0.009634 | -0.666845 | -0.241531 | -0.030480 | -0.361393 | -0.153401 | -1.334129 | 1.310526 | -0.628830 | -0.992723 | -0.780449 | -0.881216 | -0.075033 | -0.445567 | -0.715210 | 0.239950 | -1.362829 | 0.574208 | -0.613176 | 0.311412 | 0.677485 | -0.117287 | 0.724664 | -0.360832 | 0.489167 | -0.106981 | -0.179128 | 0.800638 | 0.770560 | 0.446160 | -0.343064 | -0.026166 | 1.318029 | -0.704780 | -0.353318 | -0.580070 | -0.526733 | -0.304053 | -0.338388 | 0.751202 | -0.379747 | -0.172305 | -0.504475 | -0.228307 | -0.069956 | -0.212142 | -0.175023 | 0.688715 | 0.022506 | 0.240039 | 0.150951 | -0.007030 | 0.070602 | 0.800065 | 0.777829 | -0.377377 | -0.728021 | -0.576353 | -0.391534 | -0.657331 | -0.472849 | -0.384869 | -0.299824 | 0.912240 | 0.113386 | -0.317989 | -0.198978 | -0.347119 | -0.464583 | 1.599481 | -0.169397 | -0.351213 | -0.288095 | -0.386788 | -0.572551 | -0.541194 | -0.320211 | -0.63724 | -0.588072 | -0.645610 | -0.432462 | -0.787589 | 0.883417 | -0.299866 | -0.549252 | -0.489816 | -0.668062 | -0.427217 | -0.620804 | -0.210990 | -0.281822 | -0.492968 | -0.258918 | -0.287242 | -0.197771 | 0.112170 | -0.114927 | -0.558663 | -0.085336 | 0.57735 | -0.324884 | -0.396454 | 0.507401 | -3.282452 | 0.879671 | 1.253738 | 0.853165 |
| 643 | 1.868870 | -1.220360 | -0.078303 | 0.203117 | 0.319325 | -0.037439 | 1.187290 | 0.486956 | -0.114936 | -0.214240 | 0.146557 | 0.795869 | 0.187726 | -0.256279 | 0.748040 | 0.353922 | 0.280693 | 0.715206 | 0.126977 | 0.330898 | 0.670748 | 0.178765 | -1.134013 | -0.217521 | -0.512201 | -0.828868 | 0.359849 | 0.484328 | 0.536368 | 0.861198 | -0.491152 | 0.322308 | 0.608754 | 1.002160 | 1.115269 | 0.756467 | -0.474619 | -0.106579 | -0.441860 | 0.386371 | -0.580982 | 1.324373 | -0.092421 | -0.437297 | 1.180614 | -0.827805 | -0.167285 | 0.264858 | -0.649357 | 0.271094 | -0.264450 | 0.303831 | -0.532646 | -0.496096 | 0.896299 | 0.029815 | 0.052766 | -0.168600 | 0.583411 | -0.128560 | 0.194773 | 0.115613 | -0.356954 | -0.544723 | -0.414192 | 1.161450 | 0.583416 | 0.776308 | 0.256924 | 0.259242 | 0.029305 | -0.571955 | 0.552790 | 0.061111 | -0.742850 | -0.832163 | -0.720504 | 0.677571 | 0.446384 | -1.077100 | 1.096281 | -1.036073 | 1.098041 | -0.622521 | 1.012636 | 1.249635 | -0.136817 | 0.944867 | 0.687185 | 1.245970 | -0.107757 | -0.238476 | 1.430515 | 1.475673 | 1.421358 | -0.343064 | -0.227459 | 1.601398 | -0.455591 | -0.353318 | -0.580070 | -0.526733 | -0.304053 | -0.338388 | -0.709919 | -0.379747 | -0.172305 | -0.504475 | -0.228307 | -0.102065 | -0.212142 | 1.365701 | 0.083473 | 0.042540 | -0.200460 | 0.262544 | 0.476049 | -0.497447 | 1.295388 | 0.232059 | -0.404588 | -0.706794 | -0.567830 | -0.397305 | -0.609971 | -0.493864 | -0.447394 | -0.299824 | 2.265221 | -0.520443 | -0.592368 | -0.375263 | -0.347119 | -0.464583 | 0.886104 | -0.406460 | -0.351213 | -0.088614 | -0.386788 | 0.665133 | 0.118102 | 0.251554 | -0.63724 | -0.588072 | 0.089748 | -0.790556 | -0.110609 | 1.322266 | -0.325619 | -0.442310 | 0.046489 | -0.718301 | -0.506466 | 0.143444 | -0.485155 | -0.454565 | -0.580648 | -0.381361 | 2.222107 | -0.405291 | 0.110645 | -0.279268 | -0.558663 | -0.549434 | 0.57735 | -0.324884 | -0.396454 | 0.507401 | -3.185924 | 0.247814 | 0.236190 | -1.172106 |
| 879 | 0.622219 | -1.220360 | -0.333556 | -0.382118 | -0.428163 | -1.013656 | 0.311666 | 0.398070 | -0.602314 | 0.146683 | -0.579369 | -0.082761 | -0.211206 | -0.990895 | 0.274536 | 0.014553 | -0.690715 | -0.283854 | 0.181804 | 0.277059 | -0.106193 | 0.441483 | -0.328032 | -0.217521 | -0.512201 | -0.457289 | 1.884310 | -1.109001 | 0.227825 | 1.903725 | -0.007359 | -0.429727 | 0.406385 | 0.698645 | 0.043006 | -0.273587 | -0.425612 | -0.106579 | -0.441860 | 0.335682 | -0.580982 | 0.001828 | -0.318994 | 0.911241 | -1.135950 | 0.151536 | -0.006044 | -0.152190 | 0.045786 | 0.119548 | 0.608943 | -0.246758 | -0.225640 | 0.060664 | -0.337586 | 1.023680 | 1.736465 | 1.442399 | 0.211964 | 1.143692 | 0.490138 | 2.635818 | -0.356954 | -0.330265 | 0.471469 | 0.688596 | 0.239106 | 0.356206 | -0.155482 | 0.302824 | -0.004716 | 1.122884 | -0.517099 | -0.825543 | 1.969773 | 0.977166 | -0.145083 | 0.317820 | 0.373763 | -1.175636 | 0.253101 | -0.254740 | -0.174544 | -0.385120 | -0.297596 | -0.141505 | -0.240642 | 0.518731 | 0.220608 | 0.177022 | -0.109245 | 0.021891 | -0.510581 | -0.073655 | 0.170581 | -0.343064 | -0.459951 | 0.357647 | 0.113948 | -0.353318 | 0.417687 | -0.201344 | 2.404405 | -0.337093 | 1.118426 | -0.379747 | -0.172305 | -0.504475 | -0.228307 | -0.358149 | -0.212142 | 0.026202 | -1.531519 | 0.301673 | -0.368334 | 0.234265 | -0.294029 | -0.249076 | -0.079489 | -0.495554 | -0.246980 | 0.265076 | 0.245991 | 0.225630 | 1.479221 | -0.487860 | -0.447394 | -0.299824 | 0.068677 | 0.103356 | 0.128147 | -0.201832 | -0.347119 | -0.464583 | -0.540650 | 0.304728 | 1.517018 | 0.576321 | 0.459334 | 0.665133 | 0.118102 | 0.823319 | 0.63724 | -0.588072 | -0.645610 | 2.432290 | -0.110609 | -0.338805 | -0.369323 | -0.389935 | -0.076465 | -0.754845 | -0.522836 | -0.019840 | -0.379604 | -0.483652 | -0.578653 | -0.416227 | 0.035632 | -0.214148 | -0.314149 | -0.336948 | 1.322890 | -0.483763 | 0.57735 | -0.324884 | -0.396454 | -1.160948 | -0.649893 | 0.483688 | -0.748047 | -1.172106 |
y_test.head()
474 0 689 1 762 0 643 0 879 0 Name: n_RFID, dtype: int64
# creating a function for gridsearch which can be used various models
# Create a function gridsearch which would be used for fine tuning hyper-parameters
def gridsearch_classifier(estimator, params, score, X_train, y_train, refit_score):
kfolds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 16)
grid_search = GridSearchCV(estimator, params, scoring = score, refit = refit_score, cv = kfolds, return_train_score = True, n_jobs = - 1, verbose=2)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
cv_results = grid_search.cv_results_
cv_results = pd.DataFrame(cv_results)
return cv_results
# Creating a function for metrics, confusion matrix and classification report
def classification_rep(y_train, X_train, model):
probs = np.array([x[-1] for x in model.predict_proba(X_train)])
print('The roc_auc score is {}'.format(roc_auc_score(y_train, probs).round(4)))
print('confusion matrix:')
print(confusion_matrix(y_train, model.predict(X_train)))
print('classification report:')
print(classification_report(y_train, model.predict(X_train)))
# Creating a function metrics_curve for drawing ROC_curve and
def metrics_curves(y_train, X_train, model, graphs = True):
probs = np.array([x[-1] for x in model.predict_proba(X_train)])
fpr, tpr, thresholds = roc_curve( y_train, probs)
auc_score = roc_auc_score( y_train, probs )
if graphs == True:
plt.figure(figsize=(8, 8))
plt.plot( fpr, tpr, label='ROC curve (area = %0.2f)' % auc_score )
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic curve')
plt.legend(loc="lower right")
plt.show()
p, r, thresholds = precision_recall_curve(y_train, probs)
pre_recal = pd.DataFrame({'precision':p[:-1], 'recall':r[:-1], 'thresholds': thresholds})
pre_recal.plot.line(x='thresholds', y=['precision','recall'], figsize = (8,8))
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.title('Precision vs Recall Trade-Off Curve')
plt.show()
thresh = [i/100 for i in range(0,105,5)]
results_df = pd.DataFrame( columns = ['thresh','accuracy','prec', 'recal', 'f1score'])
for i in thresh:
pred = [1 if t > i else 0 for t in probs]
pr = metrics.precision_score(y_train, pred)
rc = metrics.recall_score(y_train, pred)
ac = metrics.accuracy_score(y_train, pred)
f1 = metrics.f1_score(y_train, pred)
results_df.loc[i] = [i, ac, pr, rc, f1]
return results_df, fpr, tpr
# creating model_Results dataframe which would be used for comparing models
model_results = pd.DataFrame({'Model':[], 'Description':[] , 'Train_auc_mean': [], 'cv_auc_mean':[], 'cv_auc_std' :[], 'test_auc':[]})
model_results
| Model | Description | Train_auc_mean | cv_auc_mean | cv_auc_std | test_auc |
|---|
# Creating logistic model with class_weight = balanced
log_m = LogisticRegression(random_state = 0, n_jobs = -1, class_weight= 'balanced')
log_m.fit(X_train, y_train)
# identifying metrics for the train dataset
classification_rep(y_train, X_train, log_m)
# results of test dataset
print('\n Predicting for the test dataset \n')
classification_rep(y_test, X_test, log_m)
results, fpr, tpr = metrics_curves(y_test, X_test, log_m)
The roc_auc score is 0.9557
confusion matrix:
[[639 88]
[ 28 217]]
classification report:
precision recall f1-score support
0 0.96 0.88 0.92 727
1 0.71 0.89 0.79 245
accuracy 0.88 972
macro avg 0.83 0.88 0.85 972
weighted avg 0.90 0.88 0.88 972
Predicting for the test dataset
The roc_auc score is 0.8804
confusion matrix:
[[158 32]
[ 17 48]]
classification report:
precision recall f1-score support
0 0.90 0.83 0.87 190
1 0.60 0.74 0.66 65
accuracy 0.81 255
macro avg 0.75 0.79 0.76 255
weighted avg 0.83 0.81 0.81 255
C:\Users\raviprasad\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
##- Hyper-parametr tuning using gridsearch- penalty (regularization) and C - regularization
lr_m = LogisticRegression(random_state = 0, n_jobs = -1)
#param grid for gridsearch
params = {'solver': ['newton-cg', 'lbfgs', 'saga', 'liblinear'], 'penalty' : ['l1', 'l2', 'elasticnet'], 'C':[ 0.05, 0.1, 0.2, 0.22, 0.25, 0.28, 0.3, 0.4, 0.5, 0.8, 1, 10, 100], 'class_weight': ['balanced', None, {0:1, 1:3}]}
# using the gridsearch function created for grid search and tuning hyper parameter
cv_results = gridsearch_classifier(lr_m, params, ['recall', 'accuracy', 'precision', 'roc_auc', 'f1'], X_train, y_train, 'f1')
cv_results[['param_C', 'param_solver', 'param_penalty', 'param_class_weight', 'mean_test_recall', 'mean_test_accuracy', 'mean_test_precision', 'mean_test_roc_auc', 'std_test_roc_auc','mean_test_f1']].sort_values(by = 'mean_test_roc_auc', ascending = False).head(10)
Fitting 5 folds for each of 468 candidates, totalling 2340 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers. [Parallel(n_jobs=-1)]: Done 117 tasks | elapsed: 1.9s [Parallel(n_jobs=-1)]: Done 1216 tasks | elapsed: 5.3s [Parallel(n_jobs=-1)]: Done 2340 out of 2340 | elapsed: 20.5s finished
{'C': 0.05, 'class_weight': {0: 1, 1: 3}, 'penalty': 'l1', 'solver': 'saga'}
| param_C | param_solver | param_penalty | param_class_weight | mean_test_recall | mean_test_accuracy | mean_test_precision | mean_test_roc_auc | std_test_roc_auc | mean_test_f1 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 122 | 0.22 | saga | l1 | None | 0.579592 | 0.841570 | 0.736970 | 0.879834 | 0.014117 | 0.647083 |
| 86 | 0.2 | saga | l1 | None | 0.579592 | 0.840539 | 0.733433 | 0.879750 | 0.013952 | 0.645634 |
| 87 | 0.2 | liblinear | l1 | None | 0.583673 | 0.841560 | 0.736030 | 0.879669 | 0.014968 | 0.649677 |
| 158 | 0.25 | saga | l1 | None | 0.575510 | 0.842596 | 0.744087 | 0.879327 | 0.014825 | 0.646991 |
| 19 | 0.05 | liblinear | l2 | None | 0.604082 | 0.833317 | 0.695190 | 0.879232 | 0.022680 | 0.644155 |
| 123 | 0.22 | liblinear | l1 | None | 0.575510 | 0.840539 | 0.735890 | 0.879217 | 0.015456 | 0.644105 |
| 159 | 0.25 | liblinear | l1 | None | 0.575510 | 0.839514 | 0.731217 | 0.879103 | 0.015269 | 0.642704 |
| 18 | 0.05 | saga | l2 | None | 0.563265 | 0.837446 | 0.732505 | 0.878845 | 0.019174 | 0.633546 |
| 17 | 0.05 | lbfgs | l2 | None | 0.563265 | 0.837446 | 0.732505 | 0.878816 | 0.019283 | 0.633546 |
| 16 | 0.05 | newton-cg | l2 | None | 0.563265 | 0.837446 | 0.732505 | 0.878816 | 0.019283 | 0.633546 |
# Finalizing the model based on above hyper parameter tuning
log_m_final = LogisticRegression(C = 0.22, penalty = 'l1', random_state = 0, n_jobs = -1, solver = 'saga')
log_m_final.fit(X_train, y_train)
# identifying metrics for the train dataset
classification_rep(y_train, X_train, log_m_final)
# results of test dataset
print('\n Predicting for the test dataset \n')
classification_rep(y_test, X_test, log_m_final)
results, fpr, tpr = metrics_curves(y_test, X_test, log_m_final)
C:\Users\raviprasad\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:330: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge "the coef_ did not converge", ConvergenceWarning)
The roc_auc score is 0.9294
confusion matrix:
[[692 35]
[ 84 161]]
classification report:
precision recall f1-score support
0 0.89 0.95 0.92 727
1 0.82 0.66 0.73 245
accuracy 0.88 972
macro avg 0.86 0.80 0.83 972
weighted avg 0.87 0.88 0.87 972
Predicting for the test dataset
The roc_auc score is 0.9077
confusion matrix:
[[180 10]
[ 26 39]]
classification report:
precision recall f1-score support
0 0.87 0.95 0.91 190
1 0.80 0.60 0.68 65
accuracy 0.86 255
macro avg 0.83 0.77 0.80 255
weighted avg 0.85 0.86 0.85 255
C:\Users\raviprasad\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
model_results.loc[0] = ['log_m_final', 'logistic regression with l1 penalty', 0.9294, 0.8798, 0.014117, 0.9077]
model_results
| Model | Description | Train_auc_mean | cv_auc_mean | cv_auc_std | test_auc | |
|---|---|---|---|---|---|---|
| 0 | log_m_final | logistic regression with l1 penalty | 0.9294 | 0.8798 | 0.014117 | 0.9077 |
# using a base model with 40 trees
rf_m = RandomForestClassifier(n_estimators = 40, max_depth =10, n_jobs = -1, random_state = 0, oob_score= True, verbose = 1)
rf_m.fit(X_train, y_train)
# identifying metrics for the train dataset
classification_rep(y_train, X_train, rf_m)
# results of test dataset
print('\n Predicting for the test dataset \n')
classification_rep(y_test, X_test, rf_m)
results, fpr, tpr = metrics_curves(y_test, X_test, rf_m)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers. [Parallel(n_jobs=-1)]: Done 34 out of 40 | elapsed: 0.0s remaining: 0.0s [Parallel(n_jobs=-1)]: Done 40 out of 40 | elapsed: 0.0s finished [Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers. [Parallel(n_jobs=24)]: Done 34 out of 40 | elapsed: 0.0s remaining: 0.0s [Parallel(n_jobs=24)]: Done 40 out of 40 | elapsed: 0.0s finished
The roc_auc score is 1.0 confusion matrix: [[727 0] [ 12 233]] classification report:
[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers. [Parallel(n_jobs=24)]: Done 34 out of 40 | elapsed: 0.0s remaining: 0.0s [Parallel(n_jobs=24)]: Done 40 out of 40 | elapsed: 0.0s finished [Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers. [Parallel(n_jobs=24)]: Done 34 out of 40 | elapsed: 0.0s remaining: 0.0s [Parallel(n_jobs=24)]: Done 40 out of 40 | elapsed: 0.0s finished
precision recall f1-score support
0 0.98 1.00 0.99 727
1 1.00 0.95 0.97 245
accuracy 0.99 972
macro avg 0.99 0.98 0.98 972
weighted avg 0.99 0.99 0.99 972
Predicting for the test dataset
The roc_auc score is 0.8336
confusion matrix:
[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers. [Parallel(n_jobs=24)]: Done 34 out of 40 | elapsed: 0.0s remaining: 0.0s [Parallel(n_jobs=24)]: Done 40 out of 40 | elapsed: 0.0s finished [Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers. [Parallel(n_jobs=24)]: Done 34 out of 40 | elapsed: 0.0s remaining: 0.0s [Parallel(n_jobs=24)]: Done 40 out of 40 | elapsed: 0.0s finished
[[180 10]
[ 40 25]]
classification report:
precision recall f1-score support
0 0.82 0.95 0.88 190
1 0.71 0.38 0.50 65
accuracy 0.80 255
macro avg 0.77 0.67 0.69 255
weighted avg 0.79 0.80 0.78 255
[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers. [Parallel(n_jobs=24)]: Done 34 out of 40 | elapsed: 0.0s remaining: 0.0s [Parallel(n_jobs=24)]: Done 40 out of 40 | elapsed: 0.0s finished [Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers. [Parallel(n_jobs=24)]: Done 34 out of 40 | elapsed: 0.0s remaining: 0.0s [Parallel(n_jobs=24)]: Done 40 out of 40 | elapsed: 0.0s finished
C:\Users\raviprasad\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
# Tuning hyper parameters - Tuning n_estimators, max_depth, min_samples split and max_features
rf_m = RandomForestClassifier(random_state = 0, n_jobs = -1, class_weight= 'balanced', verbose=1, oob_score = True)
params_rf = {'min_samples_split' : [2, 5,10,20], 'n_estimators' : range(20,200,20), 'max_depth' : [5,10,12], 'max_features' : ['auto', 0.2, 0.5, 0.8]}
# using the gridsearch for identifying best parameters
cv_results = gridsearch_classifier(rf_m, params_rf, ['recall', 'accuracy', 'precision', 'roc_auc', 'f1'], X_train, y_train, 'roc_auc')
cv_results.loc[cv_results['rank_test_roc_auc'] == 1]
Fitting 5 folds for each of 432 candidates, totalling 2160 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers. [Parallel(n_jobs=-1)]: Done 114 tasks | elapsed: 4.1s [Parallel(n_jobs=-1)]: Done 317 tasks | elapsed: 12.0s [Parallel(n_jobs=-1)]: Done 600 tasks | elapsed: 32.9s [Parallel(n_jobs=-1)]: Done 965 tasks | elapsed: 55.2s [Parallel(n_jobs=-1)]: Done 1410 tasks | elapsed: 1.7min [Parallel(n_jobs=-1)]: Done 1937 tasks | elapsed: 2.2min [Parallel(n_jobs=-1)]: Done 2160 out of 2160 | elapsed: 2.7min finished [Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers. [Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 0.0s
{'max_depth': 12, 'max_features': 0.8, 'min_samples_split': 10, 'n_estimators': 140}
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed: 0.2s finished
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_max_depth | param_max_features | param_min_samples_split | param_n_estimators | params | split0_test_recall | split1_test_recall | split2_test_recall | split3_test_recall | split4_test_recall | mean_test_recall | std_test_recall | rank_test_recall | split0_train_recall | split1_train_recall | split2_train_recall | split3_train_recall | split4_train_recall | mean_train_recall | std_train_recall | split0_test_accuracy | split1_test_accuracy | split2_test_accuracy | split3_test_accuracy | split4_test_accuracy | mean_test_accuracy | std_test_accuracy | rank_test_accuracy | split0_train_accuracy | split1_train_accuracy | split2_train_accuracy | split3_train_accuracy | split4_train_accuracy | mean_train_accuracy | std_train_accuracy | split0_test_precision | split1_test_precision | split2_test_precision | split3_test_precision | split4_test_precision | mean_test_precision | std_test_precision | rank_test_precision | split0_train_precision | split1_train_precision | split2_train_precision | split3_train_precision | split4_train_precision | mean_train_precision | std_train_precision | split0_test_roc_auc | split1_test_roc_auc | split2_test_roc_auc | split3_test_roc_auc | split4_test_roc_auc | mean_test_roc_auc | std_test_roc_auc | rank_test_roc_auc | split0_train_roc_auc | split1_train_roc_auc | split2_train_roc_auc | split3_train_roc_auc | split4_train_roc_auc | mean_train_roc_auc | std_train_roc_auc | split0_test_f1 | split1_test_f1 | split2_test_f1 | split3_test_f1 | split4_test_f1 | mean_test_f1 | std_test_f1 | rank_test_f1 | split0_train_f1 | split1_train_f1 | split2_train_f1 | split3_train_f1 | split4_train_f1 | mean_train_f1 | std_train_f1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 420 | 1.597735 | 0.302008 | 2.340952 | 0.499055 | 12 | 0.8 | 10 | 140 | {'max_depth': 12, 'max_features': 0.8, 'min_sa... | 0.326531 | 0.489796 | 0.530612 | 0.530612 | 0.510204 | 0.477551 | 0.077012 | 259 | 0.994898 | 1.0 | 1.0 | 1.0 | 1.0 | 0.99898 | 0.002041 | 0.764103 | 0.835897 | 0.845361 | 0.835052 | 0.819588 | 0.82 | 0.029145 | 85 | 0.994852 | 0.997426 | 0.997429 | 0.994859 | 0.994859 | 0.995885 | 0.00126 | 0.551724 | 0.774194 | 0.787879 | 0.742857 | 0.694444 | 0.71022 | 0.085508 | 118 | 0.984848 | 0.989899 | 0.989899 | 0.98 | 0.98 | 0.984929 | 0.004427 | 0.814929 | 0.887755 | 0.888529 | 0.87361 | 0.8867 | 0.870305 | 0.028224 | 1 | 0.999833 | 0.999824 | 0.999912 | 0.999869 | 0.999807 | 0.999849 | 0.000037 | 0.410256 | 0.6 | 0.634146 | 0.619048 | 0.588235 | 0.570337 | 0.081573 | 252 | 0.989848 | 0.994924 | 0.994924 | 0.989899 | 0.989899 | 0.991899 | 0.00247 |
# further tuning n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features
rf_m = RandomForestClassifier(random_state = 0, n_jobs = -1, class_weight= 'balanced', verbose=1, oob_score = True)
params_rf = {'min_samples_split' : range(2,50,20),'min_samples_leaf' : range(2,50,20), 'n_estimators' : range(100,200,20), 'max_depth' : [12,14,16], 'max_features' : ['auto', 0.2, 0.5, 0.8]}
# using the gridsearch for identifying best parameters
cv_results = gridsearch_classifier(rf_m, params_rf, ['recall', 'accuracy', 'precision', 'roc_auc', 'f1'], X_train, y_train, 'roc_auc')
cv_results.loc[cv_results['rank_test_roc_auc'] == 1]
Fitting 5 folds for each of 540 candidates, totalling 2700 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers. [Parallel(n_jobs=-1)]: Done 114 tasks | elapsed: 5.4s [Parallel(n_jobs=-1)]: Done 317 tasks | elapsed: 16.0s [Parallel(n_jobs=-1)]: Done 600 tasks | elapsed: 38.9s [Parallel(n_jobs=-1)]: Done 965 tasks | elapsed: 1.3min [Parallel(n_jobs=-1)]: Done 1410 tasks | elapsed: 1.8min [Parallel(n_jobs=-1)]: Done 1937 tasks | elapsed: 2.6min [Parallel(n_jobs=-1)]: Done 2544 tasks | elapsed: 3.5min [Parallel(n_jobs=-1)]: Done 2700 out of 2700 | elapsed: 3.8min finished [Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers. [Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 0.0s [Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 0.0s finished
{'max_depth': 14, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_max_depth | param_max_features | param_min_samples_leaf | param_min_samples_split | param_n_estimators | params | split0_test_recall | split1_test_recall | split2_test_recall | split3_test_recall | split4_test_recall | mean_test_recall | std_test_recall | rank_test_recall | split0_train_recall | split1_train_recall | split2_train_recall | split3_train_recall | split4_train_recall | mean_train_recall | std_train_recall | split0_test_accuracy | split1_test_accuracy | split2_test_accuracy | split3_test_accuracy | split4_test_accuracy | mean_test_accuracy | std_test_accuracy | rank_test_accuracy | split0_train_accuracy | split1_train_accuracy | split2_train_accuracy | split3_train_accuracy | split4_train_accuracy | mean_train_accuracy | std_train_accuracy | split0_test_precision | split1_test_precision | split2_test_precision | split3_test_precision | split4_test_precision | mean_test_precision | std_test_precision | rank_test_precision | split0_train_precision | split1_train_precision | split2_train_precision | split3_train_precision | split4_train_precision | mean_train_precision | std_train_precision | split0_test_roc_auc | split1_test_roc_auc | split2_test_roc_auc | split3_test_roc_auc | split4_test_roc_auc | mean_test_roc_auc | std_test_roc_auc | rank_test_roc_auc | split0_train_roc_auc | split1_train_roc_auc | split2_train_roc_auc | split3_train_roc_auc | split4_train_roc_auc | mean_train_roc_auc | std_train_roc_auc | split0_test_f1 | split1_test_f1 | split2_test_f1 | split3_test_f1 | split4_test_f1 | mean_test_f1 | std_test_f1 | rank_test_f1 | split0_train_f1 | split1_train_f1 | split2_train_f1 | split3_train_f1 | split4_train_f1 | mean_train_f1 | std_train_f1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 180 | 0.883663 | 0.332522 | 0.36815 | 0.094474 | 14 | auto | 2 | 2 | 100 | {'max_depth': 14, 'max_features': 'auto', 'min... | 0.265306 | 0.44898 | 0.510204 | 0.489796 | 0.489796 | 0.440816 | 0.089981 | 526 | 0.994898 | 1.0 | 1.0 | 1.0 | 1.0 | 0.99898 | 0.002041 | 0.748718 | 0.835897 | 0.835052 | 0.824742 | 0.829897 | 0.814861 | 0.033313 | 61 | 0.998713 | 1.0 | 1.0 | 0.998715 | 1.0 | 0.999486 | 0.00063 | 0.5 | 0.814815 | 0.757576 | 0.727273 | 0.75 | 0.709933 | 0.108856 | 47 | 1.0 | 1.0 | 1.0 | 0.994924 | 1.0 | 0.998985 | 0.00203 | 0.811714 | 0.900475 | 0.890218 | 0.871921 | 0.877833 | 0.870432 | 0.03098 | 1 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 4.965068e-17 | 0.346667 | 0.578947 | 0.609756 | 0.585366 | 0.592593 | 0.542666 | 0.098538 | 526 | 0.997442 | 1.0 | 1.0 | 0.997455 | 1.0 | 0.99898 | 0.00125 |
# further tuning n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features
rf_m = RandomForestClassifier(random_state = 0, n_jobs = -1, verbose=1, oob_score = True)
params_rf = {'class_weight': ['balanced', None], 'min_samples_split' : [2,5,10],'min_samples_leaf' : [2,5,10], 'n_estimators' : range(80,120,10), 'max_depth' : [13,14,15], 'max_features' : ['auto', 0.2, 0.5, 0.8]}
# using the gridsearch for identifying best parameters
cv_results = gridsearch_classifier(rf_m, params_rf, ['recall', 'accuracy', 'precision', 'roc_auc', 'f1'], X_train, y_train, 'roc_auc')
cv_results.loc[cv_results['rank_test_roc_auc'] == 1]
Fitting 5 folds for each of 864 candidates, totalling 4320 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers. [Parallel(n_jobs=-1)]: Done 114 tasks | elapsed: 6.6s [Parallel(n_jobs=-1)]: Done 317 tasks | elapsed: 15.4s [Parallel(n_jobs=-1)]: Done 600 tasks | elapsed: 41.7s [Parallel(n_jobs=-1)]: Done 965 tasks | elapsed: 1.1min [Parallel(n_jobs=-1)]: Done 1410 tasks | elapsed: 1.7min [Parallel(n_jobs=-1)]: Done 1937 tasks | elapsed: 2.2min [Parallel(n_jobs=-1)]: Done 2544 tasks | elapsed: 3.0min [Parallel(n_jobs=-1)]: Done 3233 tasks | elapsed: 3.9min [Parallel(n_jobs=-1)]: Done 4002 tasks | elapsed: 4.9min [Parallel(n_jobs=-1)]: Done 4320 out of 4320 | elapsed: 5.6min finished [Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers. [Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 0.0s [Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 0.0s finished
{'class_weight': 'balanced', 'max_depth': 14, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_class_weight | param_max_depth | param_max_features | param_min_samples_leaf | param_min_samples_split | param_n_estimators | params | split0_test_recall | split1_test_recall | split2_test_recall | split3_test_recall | split4_test_recall | mean_test_recall | std_test_recall | rank_test_recall | split0_train_recall | split1_train_recall | split2_train_recall | split3_train_recall | split4_train_recall | mean_train_recall | std_train_recall | split0_test_accuracy | split1_test_accuracy | split2_test_accuracy | split3_test_accuracy | split4_test_accuracy | mean_test_accuracy | std_test_accuracy | rank_test_accuracy | split0_train_accuracy | split1_train_accuracy | split2_train_accuracy | split3_train_accuracy | split4_train_accuracy | mean_train_accuracy | std_train_accuracy | split0_test_precision | split1_test_precision | split2_test_precision | split3_test_precision | split4_test_precision | mean_test_precision | std_test_precision | rank_test_precision | split0_train_precision | split1_train_precision | split2_train_precision | split3_train_precision | split4_train_precision | mean_train_precision | std_train_precision | split0_test_roc_auc | split1_test_roc_auc | split2_test_roc_auc | split3_test_roc_auc | split4_test_roc_auc | mean_test_roc_auc | std_test_roc_auc | rank_test_roc_auc | split0_train_roc_auc | split1_train_roc_auc | split2_train_roc_auc | split3_train_roc_auc | split4_train_roc_auc | mean_train_roc_auc | std_train_roc_auc | split0_test_f1 | split1_test_f1 | split2_test_f1 | split3_test_f1 | split4_test_f1 | mean_test_f1 | std_test_f1 | rank_test_f1 | split0_train_f1 | split1_train_f1 | split2_train_f1 | split3_train_f1 | split4_train_f1 | mean_train_f1 | std_train_f1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 146 | 0.358715 | 0.022943 | 0.241844 | 0.001667 | balanced | 14 | auto | 2 | 2 | 100 | {'class_weight': 'balanced', 'max_depth': 14, ... | 0.265306 | 0.44898 | 0.510204 | 0.489796 | 0.489796 | 0.440816 | 0.089981 | 464 | 0.994898 | 1.0 | 1.0 | 1.0 | 1.0 | 0.99898 | 0.002041 | 0.748718 | 0.835897 | 0.835052 | 0.824742 | 0.829897 | 0.814861 | 0.033313 | 428 | 0.998713 | 1.0 | 1.0 | 0.998715 | 1.0 | 0.999486 | 0.00063 | 0.5 | 0.814815 | 0.757576 | 0.727273 | 0.75 | 0.709933 | 0.108856 | 466 | 1.0 | 1.0 | 1.0 | 0.994924 | 1.0 | 0.998985 | 0.00203 | 0.811714 | 0.900475 | 0.890218 | 0.871921 | 0.877833 | 0.870432 | 0.03098 | 1 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.346667 | 0.578947 | 0.609756 | 0.585366 | 0.592593 | 0.542666 | 0.098538 | 518 | 0.997442 | 1.0 | 1.0 | 0.997455 | 1.0 | 0.99898 | 0.00125 |
# using the finetuned hyper-parameters
# increasing the number of trees based on previous reuslt
rf_m_final = RandomForestClassifier(max_depth= 14, max_features = 'auto', min_samples_split = 2, n_estimators= 100, min_samples_leaf = 2, random_state = 0, n_jobs = -1, class_weight = 'balanced')
rf_m_final.fit(X_train, y_train)
# identifying metrics for the train dataset
classification_rep(y_train, X_train, rf_m_final)
# results of test dataset
print('\n Predicting for the test dataset \n')
classification_rep(y_test, X_test, rf_m_final)
results, fpr, tpr = metrics_curves(y_test, X_test, rf_m_final)
The roc_auc score is 1.0
confusion matrix:
[[725 2]
[ 1 244]]
classification report:
precision recall f1-score support
0 1.00 1.00 1.00 727
1 0.99 1.00 0.99 245
accuracy 1.00 972
macro avg 1.00 1.00 1.00 972
weighted avg 1.00 1.00 1.00 972
Predicting for the test dataset
The roc_auc score is 0.8532
confusion matrix:
[[178 12]
[ 40 25]]
classification report:
precision recall f1-score support
0 0.82 0.94 0.87 190
1 0.68 0.38 0.49 65
accuracy 0.80 255
macro avg 0.75 0.66 0.68 255
weighted avg 0.78 0.80 0.78 255
C:\Users\raviprasad\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
model_results.loc[1] = ['rf_m_final', 'Random Forest', 1.0, 0.870432, 0.03098, 0.8532]
model_results
| Model | Description | Train_auc_mean | cv_auc_mean | cv_auc_std | test_auc | |
|---|---|---|---|---|---|---|
| 0 | log_m_final | logistic regression with l1 penalty | 0.9294 | 0.879800 | 0.014117 | 0.9077 |
| 1 | rf_m_final | Random Forest | 1.0000 | 0.870432 | 0.030980 | 0.8532 |
# defining function gbm_fit for fitting the dataset and cross validation
def gbm_fit(gbm_m, X_train, y_train, req_cv = True, fimp = True, kfolds = 5):
gbm_m.fit(X_train, y_train)
classification_rep(y_train, X_train, gbm_m)
if req_cv:
cv_score = cross_val_score(gbm_m, X_train, y_train, cv = kfolds, n_jobs = -1, verbose =1, scoring = 'roc_auc')
print('cross validation scores:')
print('cv score mean is {}'.format(cv_score.mean()))
print('cv score std is {}'.format(cv_score.std()))
if fimp:
feat_imp = pd.DataFrame({'features': X_train.columns, 'imp':gbm_m.feature_importances_})
feat_imp.plot(kind = 'bar')
plt.show()
return cv_score
# fitting the base model
gbm_m = GradientBoostingClassifier(random_state = 0)
gbm_fit(gbm_m, X_train, y_train, fimp = False)
The roc_auc score is 1.0
confusion matrix:
[[727 0]
[ 10 235]]
classification report:
precision recall f1-score support
0 0.99 1.00 0.99 727
1 1.00 0.96 0.98 245
accuracy 0.99 972
macro avg 0.99 0.98 0.99 972
weighted avg 0.99 0.99 0.99 972
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
cross validation scores: cv score mean is 0.8865217433217973 cv score std is 0.016366148229863618
[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 2.0s finished
array([0.9032709 , 0.87866928, 0.86938776, 0.90893737, 0.87234342])
# Tuning n_estimator parameters for the alpha rate of 0.05
params1 = {'n_estimators':range(200,800,10), 'max_depth': range(2,9,1)}
grid_search = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate =0.05, max_depth=8,
min_samples_split=500, min_samples_leaf=50, subsample=0.8, max_features='sqrt',
random_state=0),
param_grid = params1, scoring='roc_auc',cv=5, n_jobs = -1, return_train_score = True, verbose = 2)
grid_search.fit(X_train,y_train)
final_cv_results = grid_search.cv_results_
final_cv_results = pd.DataFrame(final_cv_results)
final_cv_results[final_cv_results['rank_test_score'] == 1]
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
Fitting 5 folds for each of 420 candidates, totalling 2100 fits
[Parallel(n_jobs=-1)]: Done 114 tasks | elapsed: 2.0s [Parallel(n_jobs=-1)]: Done 317 tasks | elapsed: 8.0s [Parallel(n_jobs=-1)]: Done 600 tasks | elapsed: 15.5s [Parallel(n_jobs=-1)]: Done 965 tasks | elapsed: 24.3s [Parallel(n_jobs=-1)]: Done 1410 tasks | elapsed: 35.9s [Parallel(n_jobs=-1)]: Done 1937 tasks | elapsed: 49.4s [Parallel(n_jobs=-1)]: Done 2100 out of 2100 | elapsed: 54.8s finished
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_max_depth | param_n_estimators | params | split0_test_score | split1_test_score | split2_test_score | split3_test_score | split4_test_score | mean_test_score | std_test_score | rank_test_score | split0_train_score | split1_train_score | split2_train_score | split3_train_score | split4_train_score | mean_train_score | std_train_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 108 | 0.817485 | 0.008803 | 0.005204 | 4.002333e-04 | 3 | 680 | {'max_depth': 3, 'n_estimators': 680} | 0.910679 | 0.883841 | 0.873329 | 0.915693 | 0.896692 | 0.896047 | 0.01591 | 1 | 0.990314 | 0.988356 | 0.991751 | 0.987447 | 0.988095 | 0.989192 | 0.001597 |
| 168 | 0.810680 | 0.003524 | 0.005404 | 4.903297e-04 | 4 | 680 | {'max_depth': 4, 'n_estimators': 680} | 0.910679 | 0.883841 | 0.873329 | 0.915693 | 0.896692 | 0.896047 | 0.01591 | 1 | 0.990314 | 0.988356 | 0.991751 | 0.987447 | 0.988095 | 0.989192 | 0.001597 |
| 228 | 0.832231 | 0.017439 | 0.009208 | 3.603005e-03 | 5 | 680 | {'max_depth': 5, 'n_estimators': 680} | 0.910679 | 0.883841 | 0.873329 | 0.915693 | 0.896692 | 0.896047 | 0.01591 | 1 | 0.990314 | 0.988356 | 0.991751 | 0.987447 | 0.988095 | 0.989192 | 0.001597 |
| 288 | 0.820794 | 0.010238 | 0.005404 | 4.903686e-04 | 6 | 680 | {'max_depth': 6, 'n_estimators': 680} | 0.910679 | 0.883841 | 0.873329 | 0.915693 | 0.896692 | 0.896047 | 0.01591 | 1 | 0.990314 | 0.988356 | 0.991751 | 0.987447 | 0.988095 | 0.989192 | 0.001597 |
| 348 | 0.845709 | 0.023600 | 0.005805 | 4.003525e-04 | 7 | 680 | {'max_depth': 7, 'n_estimators': 680} | 0.910679 | 0.883841 | 0.873329 | 0.915693 | 0.896692 | 0.896047 | 0.01591 | 1 | 0.990314 | 0.988356 | 0.991751 | 0.987447 | 0.988095 | 0.989192 | 0.001597 |
| 408 | 0.823703 | 0.008205 | 0.005004 | 1.907349e-07 | 8 | 680 | {'max_depth': 8, 'n_estimators': 680} | 0.910679 | 0.883841 | 0.873329 | 0.915693 | 0.896692 | 0.896047 | 0.01591 | 1 | 0.990314 | 0.988356 | 0.991751 | 0.987447 | 0.988095 | 0.989192 | 0.001597 |
# Tuning n_estimator, max_depth, min_samples_split, min_samples_leaf parameters for the alpha rate of 0.05
params1 = {'n_estimators':[680], 'max_depth': [3], 'min_samples_split': range(200,500,10), 'min_samples_leaf': range(10,200,10)}
grid_search = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate =0.05, max_depth=3,
min_samples_split=500, min_samples_leaf=50, subsample=0.8, max_features='sqrt',
random_state=0),
param_grid = params1, scoring='roc_auc',cv=5, n_jobs = -1, return_train_score = True, verbose = 2)
grid_search.fit(X_train,y_train)
final_cv_results = grid_search.cv_results_
final_cv_results = pd.DataFrame(final_cv_results)
final_cv_results[final_cv_results['rank_test_score'] == 1]
Fitting 5 folds for each of 570 candidates, totalling 2850 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers. [Parallel(n_jobs=-1)]: Done 114 tasks | elapsed: 6.0s [Parallel(n_jobs=-1)]: Done 317 tasks | elapsed: 16.2s [Parallel(n_jobs=-1)]: Done 600 tasks | elapsed: 29.4s [Parallel(n_jobs=-1)]: Done 965 tasks | elapsed: 46.3s [Parallel(n_jobs=-1)]: Done 1410 tasks | elapsed: 1.1min [Parallel(n_jobs=-1)]: Done 1937 tasks | elapsed: 1.4min [Parallel(n_jobs=-1)]: Done 2544 tasks | elapsed: 1.8min [Parallel(n_jobs=-1)]: Done 2850 out of 2850 | elapsed: 2.0min finished
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_max_depth | param_min_samples_leaf | param_min_samples_split | param_n_estimators | params | split0_test_score | split1_test_score | split2_test_score | split3_test_score | split4_test_score | mean_test_score | std_test_score | rank_test_score | split0_train_score | split1_train_score | split2_train_score | split3_train_score | split4_train_score | mean_train_score | std_train_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 311 | 0.998109 | 0.014331 | 0.006306 | 0.000401 | 3 | 110 | 310 | 680 | {'max_depth': 3, 'min_samples_leaf': 110, 'min... | 0.915851 | 0.892647 | 0.866573 | 0.918508 | 0.902322 | 0.89918 | 0.0188 | 1 | 0.998613 | 0.998577 | 0.99922 | 0.997826 | 0.998948 | 0.998637 | 0.000469 |
# Tuning max_features
params1 = {'max_features': ['auto', 0.2, 0,4, 0.6, 0.8, 'sqrt'], 'subsample': [i/100 for i in range(60,100,5)]}
grid_search = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate =0.05, n_estimators = 680, max_depth = 3,
min_samples_leaf=110, min_samples_split = 310,
random_state=0),
param_grid = params1, scoring='roc_auc',cv=5, n_jobs = -1, return_train_score = True, verbose = 2)
grid_search.fit(X_train,y_train)
final_cv_results = grid_search.cv_results_
final_cv_results = pd.DataFrame(final_cv_results)
final_cv_results[final_cv_results['rank_test_score'] == 1]
Fitting 5 folds for each of 56 candidates, totalling 280 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers. [Parallel(n_jobs=-1)]: Done 114 tasks | elapsed: 15.8s [Parallel(n_jobs=-1)]: Done 233 out of 280 | elapsed: 32.6s remaining: 6.5s [Parallel(n_jobs=-1)]: Done 280 out of 280 | elapsed: 43.6s finished
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_max_features | param_subsample | params | split0_test_score | split1_test_score | split2_test_score | split3_test_score | split4_test_score | mean_test_score | std_test_score | rank_test_score | split0_train_score | split1_train_score | split2_train_score | split3_train_score | split4_train_score | mean_train_score | std_train_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 52 | 1.002986 | 0.012569 | 0.006105 | 0.000203 | sqrt | 0.8 | {'max_features': 'sqrt', 'subsample': 0.8} | 0.915851 | 0.892647 | 0.866573 | 0.918508 | 0.902322 | 0.89918 | 0.0188 | 1 | 0.998613 | 0.998577 | 0.99922 | 0.997826 | 0.998948 | 0.998637 | 0.000469 |
# Identifying the n_estimators for alpha rate of 0.05
params1 = {'n_estimators': range(300,700,10)}
grid_search = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate =0.05, n_estimators = 680, max_depth = 3,
min_samples_leaf=110, min_samples_split = 310,
random_state=0, subsample = 0.8, max_features = 'sqrt'),
param_grid = params1, scoring='roc_auc',cv=5, n_jobs = -1, return_train_score = True, verbose = 2)
grid_search.fit(X_train,y_train)
final_cv_results = grid_search.cv_results_
final_cv_results = pd.DataFrame(final_cv_results)
final_cv_results[final_cv_results['rank_test_score'] == 1]
Fitting 5 folds for each of 40 candidates, totalling 200 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers. [Parallel(n_jobs=-1)]: Done 114 tasks | elapsed: 3.2s [Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed: 6.6s finished
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_n_estimators | params | split0_test_score | split1_test_score | split2_test_score | split3_test_score | split4_test_score | mean_test_score | std_test_score | rank_test_score | split0_train_score | split1_train_score | split2_train_score | split3_train_score | split4_train_score | mean_train_score | std_train_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 31 | 0.89049 | 0.010735 | 0.005808 | 0.000241 | 610 | {'n_estimators': 610} | 0.91697 | 0.895164 | 0.865025 | 0.921042 | 0.901056 | 0.899851 | 0.019892 | 1 | 0.997532 | 0.997076 | 0.998501 | 0.996616 | 0.997782 | 0.997501 | 0.000639 |
gbm_m_final = GradientBoostingClassifier(learning_rate =0.05, n_estimators = 610, max_depth = 3, max_features ='sqrt',
min_samples_leaf=110, min_samples_split = 310, subsample=0.8,
random_state=0)
gbm_fit(gbm_m_final, X_train, y_train, req_cv = True, fimp = False)
The roc_auc score is 0.9984
confusion matrix:
[[724 3]
[ 14 231]]
classification report:
precision recall f1-score support
0 0.98 1.00 0.99 727
1 0.99 0.94 0.96 245
accuracy 0.98 972
macro avg 0.98 0.97 0.98 972
weighted avg 0.98 0.98 0.98 972
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
cross validation scores: cv score mean is 0.8998509635313738 cv score std is 0.019891613402591428
[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 0.6s finished
array([0.91696953, 0.89516354, 0.86502463, 0.92104152, 0.90105559])
# results of test dataset
print('\n Predicting for the test dataset \n')
classification_rep(y_test, X_test, gbm_m_final)
results, fpr, tpr = metrics_curves(y_test, X_test, gbm_m_final)
Predicting for the test dataset
The roc_auc score is 0.9087
confusion matrix:
[[175 15]
[ 23 42]]
classification report:
precision recall f1-score support
0 0.88 0.92 0.90 190
1 0.74 0.65 0.69 65
accuracy 0.85 255
macro avg 0.81 0.78 0.80 255
weighted avg 0.85 0.85 0.85 255
C:\Users\raviprasad\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
model_results.loc[2] = ['gbm_final', 'Gradient Boosting', 0.9984, 0.899851,0.019892, 0.9087]
model_results
| Model | Description | Train_auc_mean | cv_auc_mean | cv_auc_std | test_auc | |
|---|---|---|---|---|---|---|
| 0 | log_m_final | logistic regression with l1 penalty | 0.9294 | 0.879800 | 0.014117 | 0.9077 |
| 1 | rf_m_final | Random Forest | 1.0000 | 0.870432 | 0.030980 | 0.8532 |
| 2 | gbm_final | Gradient Boosting | 0.9984 | 0.899851 | 0.019892 | 0.9087 |
model_results
| Model | Description | Train_auc_mean | cv_auc_mean | cv_auc_std | test_auc | |
|---|---|---|---|---|---|---|
| 0 | log_m_final | logistic regression with l1 penalty | 0.9294 | 0.879800 | 0.014117 | 0.9077 |
| 1 | rf_m_final | Random Forest | 1.0000 | 0.870432 | 0.030980 | 0.8532 |
| 2 | gbm_final | Gradient Boosting | 0.9984 | 0.899851 | 0.019892 | 0.9087 |
gbm_results, gbm_fpr, gbm_tpr = metrics_curves(y_test, X_test, gbm_m_final, graphs = False)
rf_results, rf_fpr, rf_tpr = metrics_curves(y_test, X_test, rf_m_final, graphs = False)
log_results, log_fpr, log_tpr = metrics_curves(y_test, X_test, log_m_final, graphs = False)
C:\Users\raviprasad\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) C:\Users\raviprasad\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) C:\Users\raviprasad\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1221: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
fig, axes = plt.subplots(3,2, figsize = (18,24))
# ax1.plot(log_fpr, gbm_tpr, label='ROC curve (area = %0.2f)' % 0.9077)
# ax1.plot([0, 1], [0, 1], 'k--')
# ax1.set_xlim([0.0, 1.0])
# ax1.set_ylim([0.0, 1.05])
# aax1.set_xlabel('False Positive Rate or [1 - True Negative Rate]')
# ax1.set_ylabel('True Positive Rate')
# ax1.set_title('Receiver operating characteristic curve')
# ax1.legend(loc="lower right")
# print('-------------- Logistic Regression with l1 penalty --------------------------------------')
axes[0,0].plot(log_fpr, log_tpr, label='ROC curve (area = %0.2f)' % 0.9077)
axes[0,0].plot([0, 1], [0, 1], 'k--')
axes[0,0].set_xlim([0.0, 1.0])
axes[0,0].set_ylim([0.0, 1.05])
axes[0,0].set_xlabel('False Positive Rate or [1 - True Negative Rate]')
axes[0,0].set_ylabel('True Positive Rate')
axes[0,0].set_title('Receiver operating characteristic curve for logistic regression')
axes[0,0].legend(loc="lower right")
axes[0,1].plot(log_results['thresh'].values, log_results['prec'].values, 'g--', label = 'precision')
axes[0,1].legend(loc="lower right")
axes[0,1].plot(log_results['thresh'].values, log_results['recal'].values, 'r--', label = 'recall')
axes[0,1].legend(loc="lower right")
axes[0,1].set_xlim([0.0, 1.0])
axes[0,1].set_ylim([0.0, 1.05])
axes[0,1].set_title('Precision vs Recall Trade-Off Curve for logistic regression')
# print('-------------- Random Forest --------------------------------------')
axes[1,0].plot(rf_fpr, rf_tpr, label='ROC curve (area = %0.2f)' % 0.8532)
axes[1,0].plot([0, 1], [0, 1], 'k--')
axes[1,0].set_xlim([0.0, 1.0])
axes[1,0].set_ylim([0.0, 1.05])
axes[1,0].set_xlabel('False Positive Rate or [1 - True Negative Rate]')
axes[1,0].set_ylabel('True Positive Rate')
axes[1,0].set_title('Receiver operating characteristic curve for Random Forest')
axes[1,0].legend(loc="lower right")
axes[1,1].plot(rf_results['thresh'].values, rf_results['prec'].values, 'g--', label = 'precision')
axes[1,1].legend(loc="lower right")
axes[1,1].plot(rf_results['thresh'].values, rf_results['recal'].values, 'r--', label = 'recall')
axes[1,1].legend(loc="lower right")
axes[1,1].set_xlim([0.0, 1.0])
axes[1,1].set_ylim([0.0, 1.05])
axes[1,1].set_title('Precision vs Recall Trade-Off Curve for Random Forest')
# print('-------------- Random Forest --------------------------------------')
axes[2,0].plot(gbm_fpr, gbm_tpr, label='ROC curve (area = %0.2f)' % 0.9087)
axes[2,0].plot([0, 1], [0, 1], 'k--')
axes[2,0].set_xlim([0.0, 1.0])
axes[2,0].set_ylim([0.0, 1.05])
axes[2,0].set_xlabel('False Positive Rate or [1 - True Negative Rate]')
axes[2,0].set_ylabel('True Positive Rate')
axes[2,0].set_title('Receiver operating characteristic curve for gradient boosting')
axes[2,0].legend(loc="lower right")
axes[2,1].plot(gbm_results['thresh'].values, gbm_results['prec'].values, 'g--', label = 'precision')
axes[2,1].legend(loc="lower right")
axes[2,1].plot(gbm_results['thresh'].values, gbm_results['recal'].values, 'r--', label = 'recall')
axes[2,1].legend(loc="lower right")
axes[2,1].set_xlim([0.0, 1.0])
axes[2,1].set_ylim([0.0, 1.05])
axes[2,1].set_title('Precision vs Recall Trade-Off Curve')
Text(0.5, 1.0, 'Precision vs Recall Trade-Off Curve')
From the above curves
For selecting the optimum thresholds, since there is slight imbalance in the dataset, we cannot use accuracy or precision or recall. we need to use a metric which holds true with slightly im-balanced dataset like this
some of the metrics that can be used are
In this assignment we would be using F1 score for selecting optimum threshold. Lets visualize the F1 scores of the three tuned models (logistic, random forest, gradient boosting) based on these thresholds
fig, (ax1, ax2, ax3) = plt.subplots(1,3, figsize = (18,6))
ax1.plot(log_results['thresh'].values, log_results['f1score'].values, 'k--', label = 'F1 score')
ax1.legend(loc="lower right")
ax1.set_xlim([0.0, 1.0])
ax1.set_ylim([0.0, 1.05])
ax1.set_title('F1score for different thresholds - Logistic Regression')
ax2.plot(rf_results['thresh'].values, rf_results['f1score'].values, 'k--', label = 'F1 score')
ax2.legend(loc="lower right")
ax2.set_xlim([0.0, 1.0])
ax2.set_ylim([0.0, 1.05])
ax2.set_title('F1score for different thresholds - Random Forest')
ax3.plot(gbm_results['thresh'].values, gbm_results['f1score'].values, 'k--', label = 'F1 score')
ax3.legend(loc="lower right")
ax3.set_xlim([0.0, 1.0])
ax3.set_ylim([0.0, 1.05])
ax3.set_title('F1score for different thresholds - Gradient Boosting Regression Trees')
Text(0.5, 1.0, 'F1score for different thresholds - Gradient Boosting Regression Trees')
# Lets have a further closer look on the thresholds using the charts
F1scores = pd.DataFrame()
F1scores['thresholds'] = log_results['thresh']
# F1scores = F1scores.reset_index(drop = True)
F1scores['log_f1scores'] = log_results['f1score']
F1scores['rf_f1scores'] = rf_results['f1score']
F1scores['gbm_f1scores'] = gbm_results['f1score']
F1scores
| thresholds | log_f1scores | rf_f1scores | gbm_f1scores | |
|---|---|---|---|---|
| 0.00 | 0.00 | 0.406250 | 0.410095 | 0.406250 |
| 0.05 | 0.05 | 0.528926 | 0.443686 | 0.581818 |
| 0.10 | 0.10 | 0.600985 | 0.518219 | 0.645833 |
| 0.15 | 0.15 | 0.677778 | 0.557078 | 0.685714 |
| 0.20 | 0.20 | 0.705202 | 0.578947 | 0.686747 |
| 0.25 | 0.25 | 0.708075 | 0.619048 | 0.692810 |
| 0.30 | 0.30 | 0.697368 | 0.630872 | 0.699301 |
| 0.35 | 0.35 | 0.671329 | 0.618705 | 0.700730 |
| 0.40 | 0.40 | 0.701493 | 0.629921 | 0.687023 |
| 0.45 | 0.45 | 0.709677 | 0.584071 | 0.677165 |
| 0.50 | 0.50 | 0.684211 | 0.490196 | 0.688525 |
| 0.55 | 0.55 | 0.615385 | 0.439560 | 0.689655 |
| 0.60 | 0.60 | 0.591837 | 0.395349 | 0.629630 |
| 0.65 | 0.65 | 0.553191 | 0.365854 | 0.601942 |
| 0.70 | 0.70 | 0.418605 | 0.166667 | 0.510638 |
| 0.75 | 0.75 | 0.404762 | 0.142857 | 0.459770 |
| 0.80 | 0.80 | 0.365854 | 0.059701 | 0.385542 |
| 0.85 | 0.85 | 0.329114 | 0.059701 | 0.266667 |
| 0.90 | 0.90 | 0.263158 | 0.059701 | 0.266667 |
| 0.95 | 0.95 | 0.166667 | 0.000000 | 0.142857 |
| 1.00 | 1.00 | 0.000000 | 0.000000 | 0.000000 |
From the above table, the maximum f1 score and corresponding thresholds are
results = pd.DataFrame({'threshold':[], 'accuracy': [], 'precision': [], 'recall': [], 'F1score': []})
results
| threshold | accuracy | precision | recall | F1score |
|---|
results.loc['Logisitc_Regression'] = log_results.loc[0.45].values
results.loc['Random_Forest'] = log_results.loc[0.3].values
results.loc['GBRT'] = log_results.loc[0.35].values
results
| threshold | accuracy | precision | recall | F1score | |
|---|---|---|---|---|---|
| Logisitc_Regression | 0.45 | 0.858824 | 0.745763 | 0.676923 | 0.709677 |
| Random_Forest | 0.30 | 0.819608 | 0.609195 | 0.815385 | 0.697368 |
| GBRT | 0.35 | 0.815686 | 0.615385 | 0.738462 | 0.671329 |
Preicision and recall both looks good for each of the tuned model
rf_imp = rf_m_final.feature_importances_
log_imp = log_m_final.coef_[0]
gbm_imp = gbm_m_final.feature_importances_
feature_imp = pd.DataFrame({'columns': X_train.columns, 'logistic': log_imp, 'rf': rf_imp, 'GBRT': gbm_imp})
feature_imp
| columns | logistic | rf | GBRT | |
|---|---|---|---|---|
| 0 | LS_1 | 0.000000 | 0.003290 | 0.001664 |
| 1 | PC2 | -0.018388 | 0.000978 | 0.003768 |
| 2 | PC4 | 0.120359 | 0.002385 | 0.003816 |
| 3 | PC5 | 0.000000 | 0.004188 | 0.000764 |
| 4 | PC6 | 0.000000 | 0.003855 | 0.001200 |
| 5 | PC7 | 0.000000 | 0.003941 | 0.001235 |
| 6 | PC13 | 0.000000 | 0.005504 | 0.004945 |
| 7 | PC14 | 0.000000 | 0.003581 | 0.002185 |
| 8 | PC17 | -0.116627 | 0.004238 | 0.002734 |
| 9 | PC8 | 0.000000 | 0.003234 | 0.001364 |
| 10 | PC9 | 0.000000 | 0.003636 | 0.002256 |
| 11 | PC10 | 0.000000 | 0.003538 | 0.000949 |
| 12 | PC11 | 0.000000 | 0.004477 | 0.004081 |
| 13 | PC12 | 0.000000 | 0.003046 | 0.003458 |
| 14 | PC18 | 0.000000 | 0.004051 | 0.000965 |
| 15 | PC22 | 0.000000 | 0.002639 | 0.002314 |
| 16 | PC24 | 0.000000 | 0.003209 | 0.001149 |
| 17 | PC25 | 0.145587 | 0.004810 | 0.000969 |
| 18 | PC26 | -0.024611 | 0.002949 | 0.000085 |
| 19 | PC27 | -0.097742 | 0.007237 | 0.003934 |
| 20 | PC28 | 0.000000 | 0.004217 | 0.000000 |
| 21 | PC29 | -0.048158 | 0.001557 | 0.002786 |
| 22 | PC30 | 0.000000 | 0.002248 | 0.001029 |
| 23 | PC31 | 0.050240 | 0.000215 | 0.000000 |
| 24 | PC32 | -0.050384 | 0.001119 | 0.000000 |
| 25 | PC33 | -0.263899 | 0.008631 | 0.006382 |
| 26 | PC34 | 0.117672 | 0.001900 | 0.003903 |
| 27 | PC35 | 0.000000 | 0.002270 | 0.000746 |
| 28 | PC36 | 0.232092 | 0.008519 | 0.010905 |
| 29 | PC37 | 0.095884 | 0.004004 | 0.001050 |
| 30 | PC38 | 0.015861 | 0.007982 | 0.004911 |
| 31 | PC39 | 0.000000 | 0.003593 | 0.001478 |
| 32 | PC40 | 0.000000 | 0.003552 | 0.002383 |
| 33 | PC41 | 0.000000 | 0.003702 | 0.004166 |
| 34 | PC42 | 0.000000 | 0.006842 | 0.003762 |
| 35 | PC43 | 0.000000 | 0.004442 | 0.005846 |
| 36 | PC44 | 0.001632 | 0.003233 | 0.007984 |
| 37 | PC3 | 0.062117 | 0.002992 | 0.000000 |
| 38 | N_14 | 0.032239 | 0.004417 | 0.008684 |
| 39 | N_15 | 0.000000 | 0.014441 | 0.007660 |
| 40 | N_3 | 0.000000 | 0.003887 | 0.001164 |
| 41 | N_16 | 0.000000 | 0.003531 | 0.001781 |
| 42 | N_18 | 0.000000 | 0.004544 | 0.000789 |
| 43 | N_19 | 0.000000 | 0.005899 | 0.009537 |
| 44 | N_20 | 0.000000 | 0.003350 | 0.001969 |
| 45 | N_21 | 0.000000 | 0.004808 | 0.002106 |
| 46 | N_25 | 0.000000 | 0.003309 | 0.003691 |
| 47 | N_26 | 0.000000 | 0.009109 | 0.016318 |
| 48 | N_27 | 0.000000 | 0.003563 | 0.001815 |
| 49 | N_33 | 0.000000 | 0.004813 | 0.001385 |
| 50 | N_34 | 0.000000 | 0.005525 | 0.006094 |
| 51 | N_35 | -0.157783 | 0.004747 | 0.007560 |
| 52 | N_36 | 0.000000 | 0.003006 | 0.002146 |
| 53 | N_24 | 0.000000 | 0.005319 | 0.001888 |
| 54 | N_31 | -0.181407 | 0.018724 | 0.013378 |
| 55 | N_37 | 0.000000 | 0.005474 | 0.001001 |
| 56 | N_38 | 0.000000 | 0.005473 | 0.000351 |
| 57 | N_39 | 0.000000 | 0.004904 | 0.003537 |
| 58 | N_40 | 0.000000 | 0.003249 | 0.000404 |
| 59 | N_41 | 0.000000 | 0.004442 | 0.002592 |
| 60 | N_42 | 0.000000 | 0.004577 | 0.001836 |
| 61 | N_43 | 0.000000 | 0.006631 | 0.003122 |
| 62 | N_44 | -0.190631 | 0.002480 | 0.012977 |
| 63 | N_45 | -0.050516 | 0.003026 | 0.003355 |
| 64 | N_46 | 0.624409 | 0.033971 | 0.057395 |
| 65 | N_47 | 0.103608 | 0.009467 | 0.007923 |
| 66 | N_48 | 0.081218 | 0.014457 | 0.008085 |
| 67 | N_49 | 0.113475 | 0.009400 | 0.009208 |
| 68 | N_50 | 0.000000 | 0.014447 | 0.024873 |
| 69 | N_51 | 0.071596 | 0.032092 | 0.019146 |
| 70 | N_32 | 0.360388 | 0.006544 | 0.005599 |
| 71 | N_29 | 0.166857 | 0.015150 | 0.019599 |
| 72 | N_1 | 0.000000 | 0.004339 | 0.000694 |
| 73 | N_62 | 0.070344 | 0.003516 | 0.005761 |
| 74 | N_63 | -0.097406 | 0.012473 | 0.020175 |
| 75 | N_30 | 0.000000 | 0.005111 | 0.000840 |
| 76 | N_64 | 0.085456 | 0.014359 | 0.009862 |
| 77 | N_5 | 0.000000 | 0.005037 | 0.003904 |
| 78 | N_6 | 0.000000 | 0.006933 | 0.012635 |
| 79 | N_7 | -0.037335 | 0.006555 | 0.003955 |
| 80 | N_8 | 0.000000 | 0.005901 | 0.001049 |
| 81 | N_9 | 0.000000 | 0.002933 | 0.004224 |
| 82 | N_10 | 0.000000 | 0.002766 | 0.000827 |
| 83 | N_11 | 0.000000 | 0.003893 | 0.002124 |
| 84 | N_52 | 0.000000 | 0.003405 | 0.001435 |
| 85 | N_53 | 0.000000 | 0.002531 | 0.000769 |
| 86 | N_54 | -0.085559 | 0.007969 | 0.001457 |
| 87 | N_13 | 0.000000 | 0.004127 | 0.002664 |
| 88 | N_55 | 0.000000 | 0.004006 | 0.006649 |
| 89 | N_56 | 0.000000 | 0.004514 | 0.003294 |
| 90 | N_57 | -0.029394 | 0.008469 | 0.002335 |
| 91 | N_58 | 0.000000 | 0.003818 | 0.002729 |
| 92 | N_59 | 0.178381 | 0.004182 | 0.000909 |
| 93 | N_60 | 0.000000 | 0.003776 | 0.001566 |
| 94 | N_61 | 0.000000 | 0.004581 | 0.001325 |
| 95 | LC_1 | 0.000000 | 0.000882 | 0.000000 |
| 96 | LC_2 | 0.246561 | 0.006363 | 0.019380 |
| 97 | LC_4 | -0.678322 | 0.020545 | 0.050536 |
| 98 | LC_5 | 0.232494 | 0.012880 | 0.013185 |
| 99 | LC_6 | 0.000000 | 0.000889 | 0.002557 |
| 100 | LC_7 | 0.151278 | 0.005505 | 0.000270 |
| 101 | LC_8 | -0.241201 | 0.004118 | 0.003633 |
| 102 | LC_11 | 0.000000 | 0.003780 | 0.000000 |
| 103 | LC_13 | 0.000000 | 0.001739 | 0.000176 |
| 104 | LC_15 | 0.000000 | 0.001910 | 0.002548 |
| 105 | LC_16 | 0.134900 | 0.002766 | 0.000586 |
| 106 | LC_17 | -0.177456 | 0.001128 | 0.000000 |
| 107 | LC_21 | -0.003004 | 0.001149 | 0.000277 |
| 108 | LC_22 | 0.046502 | 0.002012 | 0.000000 |
| 109 | LC_24 | 0.000000 | 0.002628 | 0.000712 |
| 110 | LC_25 | 0.226625 | 0.002662 | 0.000000 |
| 111 | L_1 | -0.031862 | 0.003220 | 0.004567 |
| 112 | L_2 | 0.000000 | 0.004659 | 0.001930 |
| 113 | L_3 | 0.023510 | 0.010856 | 0.042890 |
| 114 | L_4 | 0.000000 | 0.005133 | 0.001956 |
| 115 | L_5 | 0.113495 | 0.008792 | 0.004282 |
| 116 | L_6 | 0.000000 | 0.003947 | 0.000857 |
| 117 | EC_1 | 0.000000 | 0.004530 | 0.003722 |
| 118 | EC_2 | 0.000000 | 0.004381 | 0.001406 |
| 119 | EC_4 | 0.000000 | 0.004578 | 0.001811 |
| 120 | EC_7 | 0.000000 | 0.004331 | 0.001470 |
| 121 | EC_8 | 0.000000 | 0.014338 | 0.000891 |
| 122 | EC_9 | 0.000000 | 0.024844 | 0.046473 |
| 123 | EC_10 | 0.000000 | 0.004953 | 0.008136 |
| 124 | EC_11 | 0.026755 | 0.022022 | 0.011176 |
| 125 | NERST_nr_car | 0.000000 | 0.005973 | 0.018329 |
| 126 | NERST_nr_mw | 0.013626 | 0.002277 | 0.000985 |
| 127 | NERST_nr_truck | 0.000000 | 0.001069 | 0.000149 |
| 128 | road_density | 0.000000 | 0.005407 | 0.001213 |
| 129 | TRDENS_nr_car | 0.000000 | 0.010252 | 0.006372 |
| 130 | TRDENS_nr_mw | 0.019165 | 0.008765 | 0.007036 |
| 131 | TRDENS_nr_truck | 0.000000 | 0.008505 | 0.003418 |
| 132 | n.accomodation | 0.000000 | 0.000491 | 0.000125 |
| 133 | n.culture | -0.124193 | 0.003778 | 0.000698 |
| 134 | n.education | 0.054249 | 0.001772 | 0.002197 |
| 135 | n.entertainment | 0.000000 | 0.009044 | 0.003946 |
| 136 | n.family | 0.000000 | 0.000449 | 0.000000 |
| 137 | n.fashion | 0.000000 | 0.002806 | 0.000578 |
| 138 | n.food | 0.037996 | 0.010896 | 0.018996 |
| 139 | n.health | 0.000000 | 0.000750 | 0.000528 |
| 140 | n.hobby | 0.124652 | 0.000377 | 0.000000 |
| 141 | n.household | -0.314905 | 0.001845 | 0.000000 |
| 142 | n.money | 0.115007 | 0.003934 | 0.008286 |
| 143 | n.public | 0.143968 | 0.000678 | 0.000000 |
| 144 | n.sport | 0.000000 | 0.002383 | 0.000427 |
| 145 | n.transportation | 0.000000 | 0.002755 | 0.000588 |
| 146 | n.work | 0.000000 | 0.002809 | 0.000686 |
| 147 | min_dist.accomodation | 0.000000 | 0.006985 | 0.015317 |
| 148 | min_dist.culture | -0.308771 | 0.014814 | 0.018062 |
| 149 | min_dist.education | 0.000000 | 0.007394 | 0.012876 |
| 150 | min_dist.entertainment | 0.000000 | 0.007409 | 0.008361 |
| 151 | min_dist.family | 0.000000 | 0.003429 | 0.003238 |
| 152 | min_dist.fashion | -0.103247 | 0.007180 | 0.001287 |
| 153 | min_dist.food | -0.137688 | 0.008158 | 0.014341 |
| 154 | min_dist.health | 0.031735 | 0.004603 | 0.001425 |
| 155 | min_dist.hobby | 0.000000 | 0.003688 | 0.001177 |
| 156 | min_dist.household | -0.055312 | 0.006622 | 0.002375 |
| 157 | min_dist.money | -0.268111 | 0.008334 | 0.011312 |
| 158 | min_dist.public | 0.000000 | 0.006603 | 0.007537 |
| 159 | min_dist.sport | -0.032657 | 0.008086 | 0.009076 |
| 160 | min_dist.transportation | -0.050926 | 0.008050 | 0.004457 |
| 161 | min_dist.work | 0.189640 | 0.005155 | 0.004888 |
| 162 | n_of_nn_chst | -0.107919 | 0.001346 | 0.000248 |
| 163 | min_dist_chst | 0.000000 | 0.003569 | 0.002334 |
| 164 | RoadType_residential | 0.000000 | 0.000813 | 0.000000 |
| 165 | RoadType_secondary | 0.000000 | 0.000107 | 0.000000 |
| 166 | RoadType_tertiary | 0.000000 | 0.000100 | 0.000000 |
| 167 | npoint | 0.595290 | 0.010153 | 0.024598 |
| 168 | max_power | 0.530325 | 0.030850 | 0.058325 |
| 169 | lat | 0.041942 | 0.004478 | 0.003881 |
| 170 | lon | -0.304098 | 0.008528 | 0.013460 |
| 171 | CP_type | 0.466606 | 0.005213 | 0.020122 |
From the above table, logistic with l1 penalty can results in sparse coefficients and therefore feature selection. Lets understand the importand features using logistic regression
# features that can effect popularity of charging pool station based on logistic regression
log_feat_imp = feature_imp[['columns', 'logistic']].sort_values('logistic', axis = 0)
log_feat_imp = log_feat_imp[log_feat_imp['logistic'] != 0].sort_values('logistic', axis = 0, ascending = False)
# selecting features that having importance more than 0.2
log_feat_imp = log_feat_imp[(log_feat_imp['logistic'] >= 0.2) |(log_feat_imp['logistic'] <= -0.2)]
plt.figure(figsize = (18,8))
sns.barplot(x = log_feat_imp['columns'], y = log_feat_imp['logistic'])
plt.xticks(rotation = 90)
plt.title('Major features influencing popularity of charging stations based on Logistic regression')
plt.show()
# features that can effect popularity of charging pool station based on random forest
rf_feat_imp = feature_imp[['columns', 'rf']].sort_values('rf', axis = 0)
# selecting features that having importance more than 0.02
rf_feat_imp = rf_feat_imp[(rf_feat_imp['rf'] >= 0.02) |(rf_feat_imp['rf'] <= -0.02)].sort_values('rf', axis = 0, ascending = False)
plt.figure(figsize = (18,8))
sns.barplot(x = rf_feat_imp['columns'], y = rf_feat_imp['rf'])
plt.xticks(rotation = 90)
plt.title('Major features influencing popularity of charging stations based on Random Forest')
plt.show()
# features that can effect popularity of charging pool station based on Gradient boosted Regression Trees
gbm_feat_imp = feature_imp[['columns', 'GBRT']].sort_values('GBRT', axis = 0)
# selecting features that having importance more than 0.02
gbm_feat_imp = gbm_feat_imp[(gbm_feat_imp['GBRT'] >= 0.02) |(gbm_feat_imp['GBRT'] <= -0.02)].sort_values('GBRT', axis = 0, ascending = False)
plt.figure(figsize = (18,8))
sns.barplot(x = gbm_feat_imp['columns'], y = gbm_feat_imp['GBRT'])
plt.xticks(rotation = 90)
plt.title('Major features influencing popularity of charging stations based on Gradient Boosted Regression Trees')
plt.show()
End of the Document